Load packages and define wd
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(ggpubr)
library(ggExtra)
library(reshape2)
library(knitr)
library(kableExtra)
library(vegan)
library(PerformanceAnalytics)
library(ComplexHeatmap)
library(RColorBrewer)
library(DT)
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.2
library(rstatix)
## Warning: package 'rstatix' was built under R version 3.6.2
library(tidyverse)
library(broom)
knitr::opts_chunk$set(tidy.opts=list(width.cutoff=50),tidy=TRUE, message=FALSE)
#setwd("/Users/tkq300/Desktop/mac_data/ProxARG/early_sim_fil/")
setwd("/Users/tkq300/Desktop/mac_data/ProxARG/final_analysis4/")
knitr::opts_knit$set(root.dir = "/Users/tkq300/Desktop/mac_data/ProxARG/final_analysis4/")
#Fixed color scale for mechanisms
colours = c("Antibiotic efflux" = "#A6CEE3", "Antibiotic inactivation" = "#1F78B4", "Antibiotic target alteration" = "#B2DF8A", "Antibiotic target protection" = "#33A02C", "Antiobiotic target replacement" = "#FB9A99")
Lengths of transposons from the Transposon Registry (Tansirichaiya et al., 2019)
# Import and reformat data
tnp_reg = read.csv("tnp_registry_supp2.csv", sep = ";",
dec = ",")
tnp_reg$Length = gsub(",", ".", tnp_reg$Length)
tnp_reg$Length = as.numeric(as.character(tnp_reg$Length))
tnp_reg2 = subset(tnp_reg, Type == "Unit transposon" |
Type == "Composite transposon")
p = ggplot(tnp_reg2, aes(Type, Length, color = Type)) +
geom_violin() + geom_jitter() + scale_y_continuous(breaks = seq(0,
80, 5)) + geom_hline(yintercept = 12.17) + theme_light() +
theme(legend.position = "none") + geom_hline(yintercept = 24.34,
linetype = "dashed") + ylab("Length (kbp)")
ggExtra::ggMarginal(p, type = "histogram", margins = "y")

# At 10 kbp, what percentage of composite and unit
# transposons do we include?
tnp_reg3 = subset(tnp_reg2, Length <= 24.34)
nrow(tnp_reg3)/nrow(tnp_reg2) * 100
## [1] 77.72829
# What is the mean?
mean(tnp_reg2$Length, na.rm = T)
## [1] 12.1732
# 12.17 kbp
median(tnp_reg2$Length, na.rm = T)
## [1] 8.2
The distance between a usually unmobilized housekeeping gene, 16S rRNA, and closest IS element
Since the 16S rRNA gene should only extremely rarely be mobilized (and fixated) by IS elements, we can use the 16S:IS distance to validate that we are searching within a reasonable distance of ARG:IS associations
# Analyse 16S distance results
dist_16S = read.csv("16S_IS_distance", sep = " ", header = F)
colnames(dist_16S) = c("Region", "IS_Start", "IS_End",
"IS_element", "16S_Start", "16S_end", "Distance")
# A few IS elements apparently overlap with 16S,
# resulting in negative distances. Remove these.
dist_16S = dist_16S[which(dist_16S$Distance > 1), ]
dist_16S_only = as.data.frame(dist_16S$Distance)
# For all those 16S regions without IS elements
# within 100kb in either direction, add them as
# 100kb (more than)
long_dist16 = as.data.frame(rep(1e+05, 39174))
colnames(long_dist16) = "Distance"
colnames(dist_16S_only) = "Distance"
dist_16S_with_100kb = rbind(dist_16S_only, long_dist16)
# Ignore hits that do not have IS within 100 kbp
dist_16S_only$Gene = "16S"
ggplot(dist_16S_only, aes(Gene, Distance)) + geom_violin() +
theme_light() + geom_hline(yintercept = 12170) +
labs(y = "Distance to IS element (bp)", x = "")

# How many 16S genes have IS elements within 12.17
# kbp?
nrow(subset(dist_16S_only, Distance <= 12170))
## [1] 4480
# If you include those that do not have IS elements
# within 100k, the numbers are How many 16S genes
# have IS elements within 12.17 kbp?
nrow(subset(dist_16S_with_100kb, Distance <= 12170))
## [1] 4480
# And in percentage
nrow(subset(dist_16S_with_100kb, Distance <= 12170))/nrow(dist_16S_with_100kb) *
100
## [1] 5.590147
Investigate database biases
# Investigate database bias in taxonomy composition
# before identifying ARGs and clustering
p_tax = read.csv("plasmid_tax", header = F, sep = "\t")
colnames(p_tax) = c("Acc.", "Superkingdom", "Kingdom",
"Phylum", "Class", "Order", "Family", "Genus")
c_tax = read.csv("chr_tax", header = F, sep = "\t")
colnames(c_tax) = c("Acc.", "Superkingdom", "Kingdom",
"Phylum", "Class", "Order", "Family", "Genus")
c_tax$type = as.character("Chr")
p_tax$type = as.character("Plasmid")
both_tax = rbind(c_tax, p_tax)
# Calculate frequencies of groups and remove those
# making up less than 0.5% of total data
both_tax2 = both_tax %>% group_by(Class, Order, type) %>%
tally() %>% mutate(freq = (n/sum(n)) * 100)
both_tax2 = both_tax2[which(both_tax2$freq > 0.5),
]
# The below figure (Supplemental figure S7) does
# not format neatly in this Rmarkdown report. The
# code below has been commented out but was used to
# produce figure S7 ggplot(both_tax2, aes(x =
# Order, y = freq, fill = type)) + geom_bar(stat =
# 'identity',position = 'dodge') + ylab('Relative
# frequencies') + facet_wrap(~ Class, nrow = 2,
# drop = T, scales = 'free_x') + theme_light() +
# theme(axis.text.x = element_text(angle = 45,
# hjust = 1), strip.text.x = element_text(angle =
# 45))
# CARD db taxonomy was found with the below command
# in linux terminal: fasta_formatter -i
# protein_fasta_protein_homolog_model.fasta -t |
# grep -o '\[.*\]' | awk '{print $1}' | sed -e
# 's/\[//' -e 's/\]//' | sort | uniq -c | sed -e
# 's/^ *//' > protein_homolog.tax
card_tax = read.csv("protein_homolog.tax", header = F,
sep = " ")
colnames(card_tax) = c("n", "Tax")
card_tax$freq = card_tax$n/sum(card_tax$n) * 100
card_tax$db = "CARD"
card_tax2 = card_tax[which(card_tax$freq > 0.5), ]
# Import taxonomy from refseq complete genomes.
# Since multiple replicon may occur per strain
# entry in database, only the first and largest
# chromosome was selected here (selected with bash
# functions in UNIX)
refseq_tax = read.csv("refseq_1st_chr.tax", header = F,
sep = "\t")
colnames(refseq_tax) = c("Acc.", "TaxID", "Superkingdom",
"Kingdom", "Phylum", "Class", "Order", "Family",
"Genus")
refseq_tax_freq = as.data.frame(refseq_tax %>% count(Phylum,
Class, Order, Family, Genus) %>% mutate(freq = (n/sum(n)) *
100))
refseq_tax_freq = refseq_tax_freq %>% group_by(Order) %>%
mutate(order_freq = sum(freq)) %>% filter(order_freq >
0.5)
ggplot(refseq_tax_freq, aes(Order, order_freq)) + geom_bar(stat = "identity",
position = "dodge") + facet_wrap(~Class, scales = "free_x",
nrow = 2) + theme_light() + theme(axis.text.x = element_text(angle = 90,
hjust = 1)) + labs(y = "Relative frequency") +
scale_fill_brewer(palette = "Paired")

refseq_tax2 = as.data.frame(refseq_tax %>% group_by(Genus) %>%
tally() %>% mutate(freq = (n/sum(n)) * 100))
refseq_tax2$db = "RefSeq"
colnames(refseq_tax2) = c("Tax", "n", "freq", "db")
refseq_tax2 = refseq_tax2[, c(2, 1, 3, 4)]
# Merge CARD and RefSeq taxonomies for comparison
refseq_tax2_card = refseq_tax2[refseq_tax2$Tax %in%
card_tax2$Tax, ]
card_refseq_tax_sub = rbind(refseq_tax2_card, card_tax2)
# Plot abundance frequencies for CARD and RefSeq
# databases. Figure not used in main article or
# supplemental information
ggplot(card_refseq_tax_sub, aes(x = Tax, y = freq,
fill = db)) + geom_bar(stat = "identity", position = "dodge") +
ylab("Relative frequencies") + theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.x = element_text(angle = 45))

refseq_tax2_card2 = refseq_tax2[refseq_tax2$Tax %in%
card_tax$Tax, ]
refseq_tax2_card3 = card_tax[card_tax$Tax %in% refseq_tax2_card2$Tax,
]
colnames(refseq_tax2_card2) = c("Rn", "RTax", "Rfreq",
"Rdb")
colnames(refseq_tax2_card3) = c("Cn", "CTax", "Cfreq",
"Cdb")
common_tax = cbind(refseq_tax2_card3, refseq_tax2_card2)
# Calculate the difference in relative abundance
# between the two databases
common_tax$diff = as.numeric(as.character(common_tax$Cfreq)) -
as.numeric(as.character(common_tax$Rfreq))
# Select Genera with a relative difference larger
# than 0.5%
common_tax2 = common_tax[which(abs(common_tax$diff) >
0.5), ]
common_tax2$ndiff = common_tax2$Cn - common_tax2$Rn
for_kable = common_tax2 %>% select(2, 1, 3, 5, 7, 10,
9)
colnames(for_kable) = c("Genus", "#Card", "%Card",
"#RefSeq", "%RefSeq", "#Diff.", "%Diff.")
kable(for_kable)
|
|
Genus
|
#Card
|
%Card
|
#RefSeq
|
%RefSeq
|
#Diff.
|
%Diff.
|
|
3
|
Acinetobacter
|
416
|
15.8536585
|
259
|
1.6403825
|
157
|
14.2132760
|
|
6
|
Aeromonas
|
41
|
1.5625000
|
63
|
0.3990120
|
-22
|
1.1634880
|
|
12
|
Bacillus
|
33
|
1.2576220
|
630
|
3.9901197
|
-597
|
-2.7324978
|
|
15
|
Bifidobacterium
|
3
|
0.1143293
|
138
|
0.8740262
|
-135
|
-0.7596970
|
|
16
|
Bordetella
|
1
|
0.0381098
|
746
|
4.7248084
|
-745
|
-4.6866987
|
|
18
|
Brachyspira
|
15
|
0.5716463
|
9
|
0.0570017
|
6
|
0.5146446
|
|
21
|
Brucella
|
1
|
0.0381098
|
123
|
0.7790234
|
-122
|
-0.7409136
|
|
22
|
Burkholderia
|
13
|
0.4954268
|
273
|
1.7290519
|
-260
|
-1.2336250
|
|
25
|
Campylobacter
|
28
|
1.0670732
|
275
|
1.7417189
|
-247
|
-0.6746457
|
|
29
|
Chryseobacterium
|
23
|
0.8765244
|
49
|
0.3103426
|
-26
|
0.5661817
|
|
30
|
Citrobacter
|
114
|
4.3445122
|
80
|
0.5066819
|
34
|
3.8378303
|
|
33
|
Corynebacterium
|
5
|
0.1905488
|
248
|
1.5707138
|
-243
|
-1.3801650
|
|
37
|
Elizabethkingia
|
19
|
0.7240854
|
31
|
0.1963392
|
-12
|
0.5277461
|
|
39
|
Enterobacter
|
88
|
3.3536585
|
135
|
0.8550257
|
-47
|
2.4986329
|
|
42
|
Enterococcus
|
86
|
3.2774390
|
196
|
1.2413706
|
-110
|
2.0360684
|
|
44
|
Escherichia
|
364
|
13.8719512
|
1018
|
6.4475268
|
-654
|
7.4244245
|
|
55
|
Helicobacter
|
1
|
0.0381098
|
195
|
1.2350371
|
-194
|
-1.1969273
|
|
59
|
Klebsiella
|
427
|
16.2728659
|
502
|
3.1794287
|
-75
|
13.0934371
|
|
61
|
Lactobacillus
|
1
|
0.0381098
|
405
|
2.5650770
|
-404
|
-2.5269672
|
|
64
|
Legionella
|
4
|
0.1524390
|
120
|
0.7600228
|
-116
|
-0.6075838
|
|
65
|
Listeria
|
7
|
0.2667683
|
216
|
1.3680410
|
-209
|
-1.1012727
|
|
67
|
Mannheimia
|
1
|
0.0381098
|
90
|
0.5700171
|
-89
|
-0.5319073
|
|
73
|
Mycobacterium
|
6
|
0.2286585
|
264
|
1.6720502
|
-258
|
-1.4433916
|
|
77
|
Neisseria
|
8
|
0.3048780
|
185
|
1.1717018
|
-177
|
-0.8668238
|
|
92
|
Proteus
|
47
|
1.7911585
|
33
|
0.2090063
|
14
|
1.5821523
|
|
94
|
Pseudomonas
|
255
|
9.7179878
|
550
|
3.4834378
|
-295
|
6.2345500
|
|
102
|
Salmonella
|
63
|
2.4009146
|
827
|
5.2378238
|
-764
|
-2.8369092
|
|
103
|
Serratia
|
30
|
1.1432927
|
94
|
0.5953512
|
-64
|
0.5479415
|
|
107
|
Staphylococcus
|
73
|
2.7820122
|
619
|
3.9204509
|
-546
|
-1.1384388
|
|
110
|
Streptococcus
|
17
|
0.6478659
|
646
|
4.0914561
|
-629
|
-3.4435902
|
|
111
|
Streptomyces
|
54
|
2.0579268
|
187
|
1.1843689
|
-133
|
0.8735580
|
# Red = more abundant in CARD than RefSeq Select
# the most extreme cases for highlighting
Genus = c("Acinetobacter", "Klebsiella", "Escherichia",
"Salmonella", "Streptococcus", "Bordetella")
CARD_abund = c("15.85%", "16.27%", "13.87%", "2.40%",
"0.65%", "0.04%")
RefSeq_adund = c("1.62%", "3.18%", "6.45%", "5.24%",
"4.09%", "4.72%")
df_table = data.frame(CARD_abund, RefSeq_adund)
rownames(df_table) = Genus
colnames(df_table) = c("CARD\nAbundance", "RefSeq\nAbundance")
# Plot genera with difference relative abundance
# between the databases
ggplot(common_tax2, aes(x = reorder(CTax, -diff), diff,
fill = Cn)) + geom_bar(stat = "identity") + theme_light() +
scale_fill_continuous(high = "red", low = "blue",
name = "CARD abundance") + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + labs(x = "Genus", y = "Relative abundance difference") +
annotation_custom(tableGrob(df_table, theme = ttheme_default(base_size = 8)),
xmin = -10, xmax = 50, ymin = 3, ymax = 14)

Distribution of ARG DIAMOND hits passing either RGI filter or custom >80% ID (and >80% query coverage)
The RGI filter is a manually curated bitscore cutoff per ARO category. These numbers appear very arbitrary and rounded off considering they are bitscore values. An amino acid sequence might for example have a bitscore of 1399 to an ARO reference protein, but the curated RGI cutoff is set to 1400. Obviously, this query amino acid sequence represents a very good hit, but it just fails to meet the RGI bitscore cutoff. For cases like this, the >80% ID filter was included. The code below makes a figure that represents hits passing and failing the two cutoffs. For both filters, a minimum of 80% query coverage was set.
# Import diamond hits that either have a higher
# bitscore than RGI cutoff or are at least 80%
# similar. (default query/subject coverage is 80%)
# These files were created with simple parsing of
# DIAMOND hits in bash
passed_rgi = read.csv("diamond_hits_rgi_cutoff.pass",
header = F, sep = "\t")
failed_rgi = read.csv("diamond_hits_rgi_cutoff.fail",
header = F, sep = "\t")
# The files are very large - subset to 10000 random
# hits for both passing and failing hits
nrow(passed_rgi)
## [1] 176888
nrow(failed_rgi)
## [1] 1164572
passed_rgi_sub = sample_n(passed_rgi, 10000)
failed_rgi_sub = sample_n(failed_rgi, 10000)
# Combine passed and failed to one data frame
both_rgi_sub = rbind(passed_rgi_sub, failed_rgi_sub)
p = ggplot(both_rgi_sub, aes(V3, V15, color = V13)) +
xlim(0, 100) + geom_rect(xmin = 0, xmax = 100,
ymin = 0, ymax = 3, fill = "#99FF66") + geom_rect(xmin = 80,
xmax = 100, ymin = 0, ymax = 1, fill = "#FFA500") +
geom_rect(xmin = 0, xmax = 80, ymin = 0, ymax = 1,
fill = "#FF6633") + geom_point(alpha = 0.6) +
theme_bw() + labs(x = "% ID", y = "Bitratio", color = "% Qcov")
ggExtra::ggMarginal(p + theme(legend.position = "left"),
type = "histogram")

# Investigate the taxonomy of hits passing either
# RGI score or > 80% ID
rgi_pass = read.csv2("rgi_pass.tax", sep = "\t", header = F)
ID_pass = read.csv2("rgi_fail_ID_pass.tax", sep = "\t",
header = F)
rgi_pass$filter = "RGI bitscore"
ID_pass$filter = "ID > 80%"
both_pass = rbind(rgi_pass, ID_pass)
colnames(both_pass) = c("Superkingdom", "Kingdom",
"Phylum", "Class", "Order", "Family", "Genus",
"Filter")
both_pass2 = both_pass %>% count(Phylum, Class, Order,
Filter) %>% mutate(freq = (n/sum(n) * 100))
major_orders = both_pass2 %>% filter(freq > 0.2)
ggplot(major_orders, aes(Order, freq, fill = Filter)) +
geom_bar(stat = "identity", position = "dodge") +
facet_grid(~Class, scales = "free_x") + theme_light() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(y = "Relative frequency") + scale_fill_brewer(palette = "Paired")

minor_orders = both_pass2 %>% filter(freq < 0.2) %>%
filter(n > 10)
ggplot(minor_orders, aes(Order, freq, fill = Filter)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~Class, scales = "free_x", nrow = 2) +
theme_light() + theme(axis.text.x = element_text(angle = 90,
hjust = 1)) + labs(y = "Relative frequency") +
scale_fill_brewer(palette = "Paired")

Inspect the impact of clustering to CRLs
dist_data=read.csv("ARG_IS_distances_mech.txt",sep = "\t",header=F)
colnames(dist_data) = c("Region","ARO","Updist","Downdist","IS_Up","IS_Down","Replicon",
"Sim_cutoff","Mean_dist","Cluster_size","Mechanism","Submechanism")
dist_data$Cluster_size = gsub("size_","",dist_data$Cluster_size)
#For each region with an ARG, there can be up to two closest IS element: upstream and downstream. The distance to these are denoted as Updist and Downdist, while the Mean_dist column is the mean of the two. A distance of 0 indicates that no IS element was found.
#Cluster_size refers to how many individual regions were clustered to form a given CRL Region.
dist_data2 = dist_data
dist_data2$ARO = gsub("ARO:","",dist_data2$ARO)
#Make a new dataframe where various metric are calculated and summarized
all_aros = unique(as.vector(dist_data2$ARO))
df <- data.frame(ARO=character(),
Occurrences=numeric(),
Ratio=numeric(),
Mean_distance=numeric(),
SD_distance=numeric(),
Mechanism=character(),
Submechanism=character(),
Zero_IS=numeric(),
Replicon=character(),
Total_seqs=numeric())
for (ARO in all_aros){
temp <- dist_data2[which(dist_data2$ARO==ARO), ]
Zero_IS = nrow(temp[which(temp$Mean_dist == 0),])
number_nonzero = nrow(temp[which(temp$Mean_dist > 0),])
Ratio = number_nonzero/nrow(temp)
Occurrences = nrow(temp)
temp_nonzero = temp[which(temp$Mean_dist > 0),]
Mean_distance = mean(temp_nonzero$Mean_dist)
SD_distance = sd(temp_nonzero$Mean_dist)
Mechanism = unique(as.vector(temp$Mechanism))
IS_Up = unique(as.vector(temp$IS_Up))
IS_Down = unique(as.vector(temp$IS_Down))
Submechanism = unique(as.vector(temp$Submechanism))
p_rep = nrow(temp[which(temp$Replicon == "Plasmid"),])
c_rep = nrow(temp[which(temp$Mean_dist == "Chr"),])
Replicon = p_rep/nrow(temp)
Total_seqs = sum(as.numeric(temp$Cluster_size))
temp2 = as.data.frame(cbind(ARO,Occurrences,Ratio,Mean_distance,SD_distance,Mechanism,Submechanism,Zero_IS,Replicon,Total_seqs))
df = rbind(df,temp2)
}
#Make sure that values in columns have a correct (numeric) format
df$Occurrences = as.numeric(as.character(df$Occurrences))
df$Mean_distance = as.numeric(as.character(df$Mean_distance))
df$SD_distance = as.numeric(as.character(df$SD_distance))
df$Ratio = as.numeric(as.character(df$Ratio))
df$Zero_IS = as.numeric(as.character(df$Zero_IS))
df$Replicon = as.numeric(as.character(df$Replicon))
df$Total_seqs = as.numeric(as.character(df$Total_seqs))
#Check the number of CRLs and AROs
dist_data %>%
group_by(Mechanism) %>%
tally() %>%
arrange(desc(n))
## # A tibble: 9 x 2
## Mechanism n
## <fct> <int>
## 1 antibiotic_efflux 28953
## 2 antibiotic_inactivation 13763
## 3 antibiotic_target_alteration 4856
## 4 antibiotic_target_replacement 3360
## 5 antibiotic_target_protection 1428
## 6 antibiotic_efflux;reduced_permeability_to_antibiotic 962
## 7 antibiotic_target_alteration;antibiotic_target_replacement 276
## 8 antibiotic_target_alteration;antibiotic_efflux 155
## 9 reduced_permeability_to_antibiotic 142
unique(df) %>%
group_by(Mechanism) %>%
tally() %>%
arrange(desc(n))
## # A tibble: 9 x 2
## Mechanism n
## <fct> <int>
## 1 antibiotic_inactivation 718
## 2 antibiotic_efflux 225
## 3 antibiotic_target_alteration 128
## 4 antibiotic_target_protection 58
## 5 antibiotic_target_replacement 39
## 6 reduced_permeability_to_antibiotic 3
## 7 antibiotic_efflux;reduced_permeability_to_antibiotic 2
## 8 antibiotic_target_alteration;antibiotic_target_replacement 2
## 9 antibiotic_target_alteration;antibiotic_efflux 1
#Plot the "compression" of CRLs: How many individual were clustered into the respective CRLs.
p = ggplot(df, aes(x = Total_seqs, y = Occurrences, color = 100-(Occurrences/Total_seqs)*100)) +
geom_point(position = position_jitter(width = 0.05,height = 0.01)) +
scale_y_log10() +
scale_x_log10() +
theme_light() +
theme(legend.position=c(.2,.7),
legend.key.size = unit(0.4, "cm"),
legend.background = element_rect(colour = "black")) +
labs(x ="Total unclustered regions",y = "Number of CRLs", color = "% CRL\ncompression") +
scale_color_gradient(high = "red",low = "blue") +
ggtitle("B: CRL compression")
pcompress1 = print(ggMarginal(p, type = "violin"))

pcompress1
## NULL
#Also make an interactive version of the plot
mytext = paste("ARO = ",df$ARO,"\n","Total unclustered regions = ",df$Total_seqs,"\n","Number of CRLs = ",df$Occurrences,"\n","Mechanism = ",df$Mechanism)
pp = plotly_build(p)
style(pp, text = mytext, hoverinfo = "text")
df2 = df
#Rename mechanisms
df2$Mechanism = gsub("antibiotic_efflux","Antibiotic efflux",df2$Mechanism)
df2$Mechanism = gsub("antibiotic_target_replacement","Antibiotic target replacement",df2$Mechanism)
df2$Mechanism = gsub("antibiotic_target_alteration","Antibiotic target alteration",df2$Mechanism)
df2$Mechanism = gsub("antibiotic_inactivation","Antibiotic inactivation",df2$Mechanism)
df2$Mechanism = gsub("reduced_permeability_to_antibiotic","Reduced permeability to antibiotic",df2$Mechanism)
df2$Mechanism = gsub("antibiotic_target_protection","Antibiotic target protection",df2$Mechanism)
#Test CRL compression per mechanism. Select only major mechanisms for testing
df3 = df2 %>%
filter(Mechanism == "Antibiotic efflux" |
Mechanism == "Antibiotic target replacement" |
Mechanism == "Antibiotic target alteration" |
Mechanism == "Antibiotic inactivation" |
Mechanism == "Antibiotic target protection")
df3$Compression = 100-(df3$Occurrences/df3$Total_seqs)*100
#Perform pairwise Mann-Whitney tests (unpaired Wilcoxon Rank Sum) on the compression rates
comp_means = compare_means(Compression ~ Mechanism, data = df3, method = "wilcox.test",paired = F,p.adjust.method = "fdr") %>%
#mutate(y_pos = 2) %>%
filter(p.adj < 0.05)
#If plyr is loaded after dplyr it messes up some dplyr functions.
#detach(package:plyr)
means = df3 %>%
group_by(Mechanism) %>%
drop_na(Compression) %>%
summarise(mean_compression = mean(Compression))
colnames(means) = c("group1","mean1")
comp_means = comp_means %>%
full_join(means, by = "group1") %>%
drop_na(p) %>%
arrange(desc(mean1))
colnames(means) = c("group2","mean2")
comp_means = comp_means %>%
full_join(means, by = "group2") %>%
drop_na(p) %>%
arrange(desc(mean1),desc(mean2))
comp_means$y_pos = seq(1, 0.9+nrow(comp_means)*10, 10)
comp_means$y_pos = comp_means$y_pos+100
for(i in 1:nrow(comp_means)){
if (comp_means$p.adj[i] < 0.0001){
comp_means$p.signif[i]="****"}
else if (comp_means$p.adj[i] < 0.001){
comp_means$p.signif[i]="***"}
else if (comp_means$p.adj[i] < 0.01){
comp_means$p.signif[i]="**"}
else if (comp_means$p.adj[i] < 0.05){
comp_means$p.signif[i]="*"}
else if(comp_means$p.adj[i] > 0.05){
comp_means$p.signif[i]="ns"}
}
pcompress2 = ggplot(df3, aes(x = reorder(Mechanism, -(100-(Occurrences/Total_seqs))*100, FUN = mean),
y = 100-(Occurrences/Total_seqs)*100, color = Mechanism)) +
geom_boxplot(outlier.shape = NA) +
stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
width = 0.75, size = 1.5, linetype = "dashed", color = "blue") +
geom_jitter(size =0.5) +
theme_light() +
ggsignif::geom_signif(data=comp_means, aes(xmin=group1, xmax=group2, annotations=p.signif, y_position=y_pos), manual=T, color = "black", tip_length = 0,vjust=0.5) +
theme(axis.text.x = element_text(angle = 45,hjust = 1), legend.position = "none") +
labs(y = "CRL compression", x = "Mechanism") +
ggtitle("C: Compression rate per mechanism") +
scale_y_continuous(labels= seq(0,100,25), breaks= seq(0,100,25), limits = c(0,140))
pcompress2

#Import taxonomy data to see how clustering affect the relative abundance of genera
aro_tax = read.csv("aro_tax.Rdat",sep="\t",header =F)
colnames(aro_tax) = c("Region","ARO","Up_dist","Down_dist","IS_up","IS_down",
"Replicon","Sim_cutoff","Mean_dist","Cluster_size","Mechanism","Submechanism","Acc.","Superkingdom",
"Kingdom","Phylum","Class","Order","Family","Genus")
aro_tax$Superkingdom = as.character(aro_tax$Superkingdom)
aro_tax$ARO = gsub("ARO:","",aro_tax$ARO)
#If superkingdom is empty, remove the line
#aro_tax <- aro_tax[-which(aro_tax$Superkingdom == ""), ]
#Remove plasmids
aro_tax <- aro_tax[-which(aro_tax$Replicon == "Plasmid"), ]
#Compare relative abundance of genera with those in RefSeq complete genome db
refseq_tax = read.csv("refseq_1st_chr.tax", header = F, sep = "\t")
colnames(refseq_tax) = c("Acc.","TaxID","Superkingdom","Kingdom","Phylum","Class","Order","Family","Genus")
clust_c_tax = read.csv("post_clust_tax_c", header = F, sep = "\t")
#Each replicon can have multiple ARGs. Sort-unique the lines to only get one line per replicon
clust_c_uniq = clust_c_tax[!duplicated(clust_c_tax),]
colnames(clust_c_uniq) = c("Acc.","Superkingdom","Kingdom","Phylum","Class","Order","Family","Genus")
clust_c_tax2 = as.data.frame(clust_c_uniq %>%
group_by(Genus) %>%
tally() %>%
mutate(Cfreq = (n/sum(n))*100))
refseq_tax2 = as.data.frame(refseq_tax %>%
group_by(Genus) %>%
tally() %>%
mutate(freq = (n / sum(n))*100))
aro_tax2 = as.data.frame(aro_tax %>%
group_by(Genus) %>%
tally() %>%
#count(Genus) %>%
mutate(freq = (n / sum(n))*100))
colnames(refseq_tax2) = c("RTax","Rn","Rfreq")
colnames(aro_tax2) = c("ATax","An","Afreq")
colnames(clust_c_tax2) = c("CTax","Cn","Cfreq")
#Get only RefSeq entries that also occur in aro_tax
refseq_in_aro = refseq_tax2[refseq_tax2$RTax %in% aro_tax2$ATax,]
refseq_in_clust = refseq_tax2[refseq_tax2$RTax %in% clust_c_tax2$CTax,]
aro_in_refseq = aro_tax2[aro_tax2$ATax %in% refseq_tax2$RTax,]
clust_in_refseq = clust_c_tax2[clust_c_tax2$CTax %in% refseq_tax2$RTax,]
aro_in_refseq = aro_tax2[aro_tax2$ATax %in% refseq_in_aro$RTax,]
clust_in_refseq = clust_c_tax2[clust_c_tax2$CTax %in% refseq_in_aro$RTax,]
common_tax_aro = cbind(refseq_in_aro,aro_in_refseq)
common_tax_clust = cbind(refseq_in_clust,clust_in_refseq)
common_tax_aro$diff = as.numeric(as.character(common_tax_aro$Afreq)) - as.numeric(as.character(common_tax_aro$Rfreq))
common_tax_clust$diff = as.numeric(as.character(common_tax_clust$Cfreq)) - as.numeric(as.character(common_tax_clust$Rfreq))
data_lollipop_aro = as.data.frame(cbind(as.character(common_tax_aro$ATax),
as.numeric(as.character(common_tax_aro$Afreq)),
as.numeric(as.character(common_tax_aro$Rfreq))))
colnames(data_lollipop_aro) = c("Tax","ARG","RefSeq")
data_lollipop_clust = as.data.frame(cbind(as.character(common_tax_clust$CTax),
as.numeric(as.character(common_tax_clust$Cfreq)),
as.numeric(as.character(common_tax_clust$Rfreq))))
colnames(data_lollipop_clust) = c("Tax","Clustered","RefSeq")
data_aro <- data_lollipop_aro %>%
rowwise() %>%
mutate(mymean = rowMeans(cbind(as.numeric(as.character(ARG)),as.numeric(as.character(RefSeq))))) %>%
arrange(mymean) %>%
filter(as.numeric(as.character(ARG)) > 1 | as.numeric(as.character(RefSeq)) > 1) %>%
arrange(desc(mymean)) %>%
mutate(x=factor(Tax, Tax))
data_clust <- data_lollipop_clust %>%
rowwise() %>%
mutate(mymean = rowMeans(cbind(as.numeric(as.character(Clustered)),as.numeric(as.character(RefSeq))))) %>%
arrange(mymean) %>%
filter(as.numeric(as.character(Clustered)) > 1 | as.numeric(as.character(RefSeq)) > 1) %>%
arrange(desc(mymean)) %>%
mutate(x=factor(Tax, Tax))
data_clust_aro = data_clust %>% full_join(data_aro, by = "Tax") %>%
select(1,2,3,4,6,8)
#CARD db taxonomy was found with the linux terminal command:
#fasta_formatter -i protein_fasta_protein_homolog_model.fasta -t | grep -o "\[.*\]" | awk '{print $1}' | sed -e 's/\[//' -e 's/\]//' | sort | uniq -c | sed -e 's/^ *//' > protein_homolog.tax
card_tax = read.csv("protein_homolog.tax", header = F, sep = " ")
colnames(card_tax) = c("n","Tax")
card_tax$freq = card_tax$n/sum(card_tax$n)*100
card_tax$db = "CARD"
card_tax$Tax = as.character(card_tax$Tax)
data_clust_aro_card = as.data.frame(data_clust_aro) %>%
full_join(card_tax, by = "Tax") %>%
select(1,2,3,4,5,6,8) %>%
slice(1:nrow(data_clust_aro))
plolli = ggplot(data_clust_aro_card) +
geom_segment( aes(x=reorder(Tax,mymean.x), xend=reorder(Tax,mymean.x),
y=as.numeric(as.character(Clustered)), yend=as.numeric(as.character(ARG))), color="grey") +
geom_point( aes(x=reorder(Tax,mymean.x), y=as.numeric(as.character(Clustered))), color="forestgreen", size=3, alpha = 0.6 ) +
geom_point( aes(x=reorder(Tax,mymean.x), y=as.numeric(as.character(ARG))), color="blue", size=3, alpha = 0.6 ) +
geom_point( aes(x=reorder(Tax,mymean.x), y=as.numeric(as.character(RefSeq.x))), color="black", size=3, shape = 2 ) +
geom_point( aes(x=reorder(Tax,mymean.x), y=as.numeric(as.character(freq))), color="red", size=3, shape = 2 ) +
coord_flip()+
theme_light() +
theme(axis.text.y = element_text(face = "italic")) +
xlab("") +
ylab("Percentage of data") +
ylim(0.01,35) +
ggtitle("A: Relative abundance of top genera")
#Get the legend only
data_clust_aro_card_melt = melt(data_clust_aro_card,id.vars = "Tax",
measure.vars = c("ARG","Clustered","RefSeq.x","freq"))
legend <- cowplot::get_legend(ggplot(data_clust_aro_card_melt, aes(x=Tax,y=as.numeric(as.character(value)),color = variable, shape = variable)) +
geom_point() +
coord_flip() +
theme_light() +
scale_color_manual(labels = c("Unclustered loci","Clustered loci (CRLs)","RefSeq","CARD"),values = c("blue","forestgreen","black","red")) +
scale_shape_manual(labels = c("Unclustered loci","Clustered loci (CRLs)","RefSeq","CARD"),values = c(16,16,2,2)) +
theme(legend.title = element_blank(),
legend.key.size = unit(0.6, "cm"),
legend.text = element_text(size = 10),
legend.background = element_rect(colour = "black")) +
guides(color = guide_legend(override.aes = list(size=4))))
plolli2 = plolli + annotation_custom(legend,xmin = -20,ymin=10)
plolli2

#Get Euclidean distances
common_tax_aro_df = as.data.frame(cbind(common_tax_aro$Rfreq,common_tax_aro$Afreq))
colnames(common_tax_aro_df) = c("RefSeq freq.","Card freq.")
#Before clustering to CRLs, the euclidean distance between relative abundance of genera found in both databases is:
dist(t(common_tax_aro_df), method = "euclidean")
## RefSeq freq.
## Card freq. 30.89466
common_tax_clust_df = as.data.frame(cbind(common_tax_clust$Rfreq,common_tax_clust$Cfreq))
colnames(common_tax_clust_df) = c("RefSeq freq.","CRL freq.")
#After clustering to CRLs, the euclidean distance is reduced to:
dist(t(common_tax_clust_df), method = "euclidean")
## RefSeq freq.
## CRL freq. 10.26442
#The combined plot is not displayed properly in Rmarkdown, but is called with
#grid.arrange(plolli2,pcompress1, pcompress2, layout_matrix = rbind(c(1,2),c(1,3)))
#What is the accumulated percentage of the datasets?
paste("CRL:",sum(as.numeric(as.character(data_clust_aro_card$Clustered))),"%")
## [1] "CRL: 78.426906069945 %"
paste("Unclustered loci:",sum(as.numeric(as.character(data_clust_aro_card$ARG)),na.rm = T),"%")
## [1] "Unclustered loci: 92.6771590340765 %"
paste("CARD:",sum(as.numeric(as.character(data_clust_aro_card$freq)),na.rm = T),"%")
## [1] "CARD: 81.5167682926829 %"
paste("RefSeq:",sum(as.numeric(as.character(data_clust_aro_card$RefSeq.x)),na.rm = T),"%")
## [1] "RefSeq: 61.6378491354741 %"
Main analyses
# Filter low-occurring entries for plotting
df_filt = df[which(df$Occurrences > 2), ]
# Remove hybrid mechanisms
df_filt = subset(df_filt, !(Mechanism == "antibiotic_target_alteration;antibiotic_target_replacement" |
Mechanism == "antibiotic_target_alteration;antibiotic_efflux" |
Mechanism == "antibiotic_efflux;reduced_permeability_to_antibiotic"))
# Rename major mechanisms
df_filt$Mechanism = gsub("antibiotic_efflux", "Antibiotic efflux",
df_filt$Mechanism)
df_filt$Mechanism = gsub("antibiotic_target_replacement",
"Antibiotic target replacement", df_filt$Mechanism)
df_filt$Mechanism = gsub("antibiotic_target_alteration",
"Antibiotic target alteration", df_filt$Mechanism)
df_filt$Mechanism = gsub("antibiotic_inactivation",
"Antibiotic inactivation", df_filt$Mechanism)
df_filt$Mechanism = gsub("reduced_permeability_to_antibiotic",
"Reduced permeability to antibiotic", df_filt$Mechanism)
df_filt$Mechanism = gsub("antibiotic_target_protection",
"Antibiotic target protection", df_filt$Mechanism)
# Produce figure 4
p = ggplot(df_filt, aes(Ratio, Occurrences, color = Replicon)) +
scale_y_log10() + stat_density_2d(aes(fill = stat(nlevel)),
geom = "polygon", n = 200) + facet_wrap(. ~ Mechanism) +
scale_fill_viridis_c("Scaled\ndensity", option = "E",
alpha = 0.6) + theme_light() + geom_point(alpha = 0.8) +
scale_color_continuous(high = "red", low = "blue") +
labs(color = "Replicon\nratio", y = "Number of unique CRLs in AROs (log10)",
x = "IS ratio")
plotly_build(p)
# Instead of unique CRLs in AROs on Y-axis, plot
# the mean distance (of upstream and downstream ARG
# to IS distance). This produces supplemental
# figure S9 which shows that Antibiotic Efflux
# genes are further distanced from IS elements than
# the other mechanisms.
p_dist_dens = ggplot(df_filt, aes(Ratio, Mean_distance,
color = Replicon)) + stat_density_2d(aes(fill = stat(nlevel)),
geom = "polygon", n = 200) + facet_wrap(. ~ Mechanism) +
scale_fill_viridis_c("Scaled\ndensity", option = "E",
alpha = 0.6) + theme_light() + geom_point(alpha = 0.8) +
scale_color_continuous(high = "red", low = "blue") +
labs(color = "Replicon\nratio", y = "Mean distance to closest IS element (bp)",
x = "IS ratio") + ggtitle("A: Density plots of IS ratio against ARG-IS distance")
# Test the density estimates.
df_filt2 = subset(df, !(Mechanism == "antibiotic_target_alteration;antibiotic_target_replacement" |
Mechanism == "antibiotic_target_alteration;antibiotic_efflux" |
Mechanism == "antibiotic_efflux;reduced_permeability_to_antibiotic"))
df_filt2$Mechanism = gsub("antibiotic_efflux", "Antibiotic efflux",
df_filt2$Mechanism)
df_filt2$Mechanism = gsub("antibiotic_target_replacement",
"Antibiotic target replacement", df_filt2$Mechanism)
df_filt2$Mechanism = gsub("antibiotic_target_alteration",
"Antibiotic target alteration", df_filt2$Mechanism)
df_filt2$Mechanism = gsub("antibiotic_inactivation",
"Antibiotic inactivation", df_filt2$Mechanism)
df_filt2$Mechanism = gsub("reduced_permeability_to_antibiotic",
"Reduced permeability to antibiotic", df_filt2$Mechanism)
df_filt2$Mechanism = gsub("antibiotic_target_protection",
"Antibiotic target protection", df_filt2$Mechanism)
comp_means = compare_means(Mean_distance ~ Mechanism,
data = df_filt2, method = "wilcox.test", paired = F,
p.adjust.method = "fdr") %>% # mutate(y_pos = 2) %>%
filter(p.adj < 0.05)
means = df_filt2 %>% group_by(Mechanism) %>% drop_na(Mean_distance) %>%
summarise(mean_ARGMOB = mean(Mean_distance))
colnames(means) = c("group1", "mean1")
comp_means = comp_means %>% full_join(means, by = "group1") %>%
drop_na(p) %>% arrange(desc(mean1))
colnames(means) = c("group2", "mean2")
comp_means = comp_means %>% full_join(means, by = "group2") %>%
drop_na(p) %>% arrange(desc(mean1), desc(mean2))
comp_means$y_pos = seq(1, 0.9 + nrow(comp_means) *
800, 800)
comp_means$y_pos = comp_means$y_pos + 12500
for (i in 1:nrow(comp_means)) {
if (comp_means$p.adj[i] < 1e-04) {
comp_means$p.signif[i] = "****"
} else if (comp_means$p.adj[i] < 0.001) {
comp_means$p.signif[i] = "***"
} else if (comp_means$p.adj[i] < 0.01) {
comp_means$p.signif[i] = "**"
} else if (comp_means$p.adj[i] < 0.05) {
comp_means$p.signif[i] = "*"
} else if (comp_means$p.adj[i] > 0.05) {
comp_means$p.signif[i] = "ns"
}
}
p_dist_test = ggplot(df_filt2, aes(Mechanism, Mean_distance)) +
geom_boxplot() + ggsignif::geom_signif(data = comp_means,
aes(xmin = group1, xmax = group2, annotations = p.signif,
y_position = y_pos), manual = T, color = "black",
tip_length = 0, vjust = 0.5) + theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(y = "ARG-IS distance (bases)") + scale_y_continuous(labels = seq(0,
12500, 2500), breaks = seq(0, 12500, 2500), limits = c(0,
15000)) + stat_summary(fun.y = mean, geom = "errorbar",
aes(ymax = ..y.., ymin = ..y..), width = 0.75,
size = 1, linetype = "solid", color = "blue") +
ggtitle("B: MWU test for\n ARG-IS distances")
# Produce supplemental figure S9
grid.arrange(p_dist_dens, p_dist_test, layout_matrix = cbind(c(1),
c(2)), widths = c(2, 0.7))

# Investigate distribution of families of IS
# elements Get percentage abundance of all IS
# families per major mechanism
tnp_table2 = dist_data2 %>% gather("key", "value",
IS_Up, IS_Down) %>% group_by(Mechanism, ARO, value) %>%
summarise(n = n()) %>% mutate(freq = n/sum(n) *
100)
tnp_table3 = data.frame(tnp_table2 %>% filter(Mechanism !=
"antibiotic_efflux;reduced_permeability_to_antibiotic") %>%
filter(Mechanism != "antibiotic_target_alteration;antibiotic_efflux") %>%
filter(Mechanism != "antibiotic_target_alteration;antibiotic_target_replacement"))
foo <- data.frame(do.call("rbind", strsplit(as.character(tnp_table3$value),
":", fixed = TRUE)))
colnames(foo) = c("IS_element", "IS_family")
tnp_table3 = cbind(tnp_table3, foo)
# Renme mechanisms
tnp_table3$Mechanism = gsub("antibiotic_efflux", "Antibiotic efflux",
tnp_table3$Mechanism)
tnp_table3$Mechanism = gsub("antibiotic_target_replacement",
"Antibiotic target replacement", tnp_table3$Mechanism)
tnp_table3$Mechanism = gsub("antibiotic_target_alteration",
"Antibiotic target alteration", tnp_table3$Mechanism)
tnp_table3$Mechanism = gsub("antibiotic_inactivation",
"Antibiotic inactivation", tnp_table3$Mechanism)
tnp_table3$Mechanism = gsub("reduced_permeability_to_antibiotic",
"Reduced permeability to antibiotic", tnp_table3$Mechanism)
tnp_table3$Mechanism = gsub("antibiotic_target_protection",
"Antibiotic target protection", tnp_table3$Mechanism)
# Plot only the top 12 IS families (most abundant)
top12_is = tnp_table3 %>% group_by(IS_family) %>% tally() %>%
arrange(desc(n)) %>% head(n = 18) %>% select(1)
top12_is = as.character(top12_is$IS_family)
tnp_table4 = tnp_table3[tnp_table3$IS_family %in% top12_is,
]
# Do Wilcoxon Rank Sum tests (Mann-Whitney) on IS
# families per major mechanism
anno_df2 = compare_means(freq ~ Mechanism, group.by = "IS_family",
data = tnp_table4, method = "wilcox.test", paired = F,
p.adjust.method = "fdr") %>% filter(p.adj < 0.05)
# Convert p values to asterisks indicating
# significance levels
for (i in 1:nrow(anno_df2)) {
if (anno_df2$p.adj[i] < 1e-04) {
anno_df2$p.signif[i] = "****"
} else if (anno_df2$p.adj[i] < 0.001) {
anno_df2$p.signif[i] = "***"
} else if (anno_df2$p.adj[i] < 0.01) {
anno_df2$p.signif[i] = "**"
} else if (anno_df2$p.adj[i] < 0.05) {
anno_df2$p.signif[i] = "*"
} else if (anno_df2$p.adj[i] > 0.05) {
anno_df2$p.signif[i] = "ns"
}
}
# Get a list of the tested IS families. Loop
# through them and make Y-coordinates for plotting
# significance bars.
temp_is = unique(anno_df2$IS_family)
anno_df5 = data.frame(IS_family = character(), .y. = character(),
group1 = character(), group2 = character(), p = character(),
p.adj = character(), p.format = character(), p.signif = character(),
method = character())
for (i in temp_is) {
tempdf = anno_df2 %>% filter(IS_family == i)
tempdf$ypos = head(seq(2.1, 5.1, 0.5), n = nrow(tempdf))
anno_df5 = rbind(anno_df5, tempdf)
}
tnp_table4 = tnp_table3[tnp_table3$IS_family %in% temp_is,
]
# Convert % abundance to log10(%)
tnp_table4$freq_log = log10(tnp_table4$freq)
# Produce supplemental figure S12
is_fam_plot = ggplot(tnp_table4, aes(Mechanism, freq_log)) +
geom_boxplot(outlier.shape = NA) + theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.minor.y = element_blank()) + facet_wrap(~IS_family,
ncol = 6) + ggsignif::geom_signif(data = anno_df5,
aes(xmin = group1, xmax = group2, annotations = p.signif,
y_position = ypos), manual = TRUE, tip_length = 0,
vjust = 0.5) + scale_y_continuous(breaks = c(0,
0.5, 1, 1.5, 2), limits = c(0, 4.8)) + labs(y = "Abundance of IS families in association\n with mechanisms (log10 of %)")
is_fam_plot
## Warning: Removed 3535 rows containing non-finite values (stat_boxplot).

Investigate IS and replicon ratios in different taxonomic groups
# Import data where taxonomic information has been
# included per ARG region
set.seed(1)
aro_tax = read.csv("aro_tax.Rdat", sep = "\t", header = F)
colnames(aro_tax) = c("Region", "ARO", "Up_dist", "Down_dist",
"IS_up", "IS_down", "Replicon", "Sim_cutoff", "Mean_dist",
"Cluster_size", "Mechanism", "Submechanism", "Acc.",
"Superkingdom", "Kingdom", "Phylum", "Class", "Order",
"Family", "Genus")
aro_tax$ARO = gsub("ARO:", "", aro_tax$ARO)
# If superkingdom is empty, remove the line aro_tax
# <- aro_tax[-which(aro_tax$Superkingdom == ''), ]
# Try to group by both phylum and ARO and get mean
# distances
by_aro_tax <- aro_tax %>% group_by(Phylum, ARO)
# Make a summarized data frame.
df_aro_tax = by_aro_tax %>% summarise(mean_dist = mean(Mean_dist),
zeroes = sum(Mean_dist == 0), ratio = sum(Mean_dist >
0)/(sum(Mean_dist == 0) + sum(Mean_dist > 0)),
occurrences = n(), replicon_type = sum(Replicon ==
"Plasmid")/(sum(Replicon == "Plasmid") + sum(Replicon ==
"Chr")))
# make plot of phylums
p_phylum = ggplot(df_aro_tax, aes(x = reorder(Phylum,
-ratio, FUN = mean), ratio)) + geom_jitter(aes(size = occurrences,
color = replicon_type), alpha = 0.6) + geom_boxplot(alpha = 0.2,
fatten = NULL) + stat_summary(fun.y = mean, geom = "errorbar",
aes(ymax = ..y.., ymin = ..y..), width = 0.75,
size = 1, linetype = "solid") + scale_size_continuous(range = c(1,
3), guide = F) + theme_light() + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + labs(y = "IS ratio", color = "Replicon ratio",
size = "# CRLs", x = "Phylum") + ggtitle("A: Phyla") +
scale_color_continuous(high = "red", low = "blue",
guide = F)
# For performing statistical tests, remove phyla
# with less than 10 entries
df_aro_tax_filt = df_aro_tax %>% group_by(Phylum) %>%
filter(sum(occurrences) > 10)
# Is there normality in the data?
df_aro_tax_filt %>% group_by(Phylum) %>% filter(n() >
3) %>% shapiro_test(ratio)
## # A tibble: 5 x 4
## Phylum variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Actinobacteria ratio 0.631 2.64e-15
## 2 Bacteroidetes ratio 0.665 4.30e- 9
## 3 Firmicutes ratio 0.789 7.36e-17
## 4 Proteobacteria ratio 0.748 3.90e-34
## 5 Tenericutes ratio 0.765 5.28e- 2
# There is not normality in the data
# Perform Wilcoxon Rank Sum test (Mann-Whitney)
phylum_test = as.data.frame(compare_means(ratio ~ Phylum,
data = df_aro_tax_filt, method = "wilcox.test",
paired = F, p.adjust.method = "none")) %>% filter(p.adj <
0.05)
comparisons_phylum = as.data.frame(cbind(phylum_test$group1,
phylum_test$group2))
phylum_test
## .y. group1 group2 p p.adj p.format p.signif
## 1 ratio Actinobacteria Firmicutes 1.070055e-04 1.1e-04 0.00011 ***
## 2 ratio Actinobacteria Proteobacteria 6.656067e-08 6.7e-08 6.7e-08 ****
## 3 ratio Bacteroidetes Firmicutes 3.991445e-03 4.0e-03 0.00399 **
## 4 ratio Bacteroidetes Proteobacteria 1.703118e-04 1.7e-04 0.00017 ***
## 5 ratio Firmicutes Proteobacteria 1.557417e-02 1.6e-02 0.01557 *
## method
## 1 Wilcoxon
## 2 Wilcoxon
## 3 Wilcoxon
## 4 Wilcoxon
## 5 Wilcoxon
##### Subset bacteria
# Start with plotting firmicutes families
sub_bact <- by_aro_tax[which(by_aro_tax$Phylum == "Firmicutes"),
]
sub_bact <- sub_bact %>% group_by(Family, ARO)
sub_df_aro_tax = sub_bact %>% summarise(mean_dist = mean(Mean_dist),
zeroes = sum(Mean_dist == 0), ratio = sum(Mean_dist >
0)/(sum(Mean_dist == 0) + sum(Mean_dist > 0)),
occurrences = n(), replicon_type = sum(Replicon ==
"Plasmid")/(sum(Replicon == "Plasmid") + sum(Replicon ==
"Chr")))
p_firmi = ggplot(sub_df_aro_tax, aes(x = reorder(Family,
-ratio, FUN = mean), ratio)) + geom_jitter(aes(size = occurrences,
color = replicon_type), alpha = 0.6) + geom_boxplot(alpha = 0.2) +
stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y..,
ymin = ..y..), width = 0.75, size = 1, linetype = "solid") +
scale_size_continuous(range = c(1, 3)) + theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(y = "IS ratio", color = "Replicon ratio",
size = "# CRLs", x = "Family") + ggtitle("Firmicutes families") +
scale_color_continuous(high = "red", low = "blue",
guide = F)
p_firmi

# Plot proteobacterial phylum
sub_bact <- by_aro_tax[which(by_aro_tax$Phylum == "Proteobacteria"),
]
# Other phyla can be selected with e.g.: sub_bact
# <-
# by_aro_tax[which(by_aro_tax$Phylum=='Actinobacteria'),
# ] sub_bact <-
# by_aro_tax[which(by_aro_tax$Phylum=='Bacteroidetes'),
# ]
# What percentage of total unclustered loci does
# proteobacteria constitute?
nrow(sub_bact)/nrow(by_aro_tax) * 100
## [1] 88.17861
sub_bact <- sub_bact %>% group_by(Family, ARO)
sub_df_aro_tax = sub_bact %>% summarise(mean_dist = mean(Mean_dist),
zeroes = sum(Mean_dist == 0), ratio = sum(Mean_dist >
0)/(sum(Mean_dist == 0) + sum(Mean_dist > 0)),
occurrences = n(), replicon_type = sum(Replicon ==
"Plasmid")/(sum(Replicon == "Plasmid") + sum(Replicon ==
"Chr")))
# Remmove ca. 400 low-abundance proteobacteria
# families out of 1605 total for testing
sub_df_aro_tax_filt = sub_df_aro_tax %>% group_by(Family) %>%
filter(sum(occurrences) > 500)
# Test the IS ratio
family_test = as.data.frame(compare_means(ratio ~ Family,
data = sub_df_aro_tax_filt, method = "wilcox.test",
paired = F, p.adjust.method = "fdr")) %>% filter(p.adj <
0.05)
datatable(family_test)
# Test the replicon ratio
family_test_p = as.data.frame(compare_means(replicon_type ~
Family, data = sub_df_aro_tax_filt, method = "wilcox.test",
paired = F, p.adjust.method = "fdr")) %>% filter(p.adj <
0.05)
datatable(family_test_p)
p_proteo = ggplot(sub_df_aro_tax_filt, aes(x = reorder(Family,
-ratio, FUN = mean), ratio)) + geom_jitter(aes(size = occurrences,
color = replicon_type), alpha = 0.6) + geom_boxplot(alpha = 0.2,
fatten = NULL) + stat_summary(fun.y = mean, geom = "errorbar",
aes(ymax = ..y.., ymin = ..y..), width = 0.75,
size = 1, linetype = "solid") + scale_size_continuous(range = c(1,
3)) + theme_light() + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + labs(y = "IS ratio", color = "Replicon ratio",
size = "# CRLs", x = "Family") + ggtitle("B: Top 10 Proteobacteria families") +
scale_color_continuous(high = "red", low = "blue",
guide = F)
# p_proteo
replicon_proteo = ggplot(sub_df_aro_tax_filt, aes(x = reorder(Family,
-ratio, FUN = mean), replicon_type)) + geom_jitter(aes(size = occurrences,
color = ratio), alpha = 0.6) + geom_boxplot(alpha = 0.2,
fatten = NULL) + stat_summary(fun.y = mean, geom = "errorbar",
aes(ymax = ..y.., ymin = ..y..), width = 0.75,
size = 1, linetype = "solid") + scale_size_continuous(range = c(0.5,
6)) + theme_light() + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + labs(y = "Replicon ratio", color = "IS ratio",
size = "# CRLs", x = "Family") + ggtitle("Replicon ratio of top 10 Proteobacteria families") +
scale_color_continuous(high = "red", low = "blue")
replicon_proteo

# Enterobacteriaceae subset
sub_bact <- by_aro_tax[which(by_aro_tax$Family == "Enterobacteriaceae"),
]
# Other subsets selected with: sub_bact <-
# by_aro_tax[which(by_aro_tax$Family=='Burkholderiaceae'),
# ] sub_bact <-
# by_aro_tax[which(by_aro_tax$Family=='Bacillaceae'),
# ] sub_bact <-
# by_aro_tax[which(by_aro_tax$Family=='Enterococcaceae'),
# ] sub_bact <-
# by_aro_tax[which(by_aro_tax$Family=='Staphylococcaceae'),
# ]
sub_bact <- sub_bact %>% group_by(Genus, ARO)
sub_df_aro_tax = sub_bact %>% summarise(mean_dist = mean(Mean_dist),
zeroes = sum(Mean_dist == 0), ratio = sum(Mean_dist >
0)/(sum(Mean_dist == 0) + sum(Mean_dist > 0)),
occurrences = n(), replicon_type = sum(Replicon ==
"Plasmid")/(sum(Replicon == "Plasmid") + sum(Replicon ==
"Chr")))
sub_df_aro_tax_filt2 = sub_df_aro_tax %>% group_by(Genus) %>%
filter(sum(occurrences) > 100)
genus_test_p = as.data.frame(compare_means(replicon_type ~
Genus, data = sub_df_aro_tax_filt2, method = "wilcox.test",
paired = F, p.adjust.method = "fdr")) %>% filter(p.adj <
0.05) %>% select(2:3, 5, 7)
genus_test_p$Ratio = "Replicon"
for (i in 1:nrow(genus_test_p)) {
if (genus_test_p$p.adj[i] < 1e-04) {
genus_test_p$p.signif[i] = "****"
} else if (genus_test_p$p.adj[i] < 0.001) {
genus_test_p$p.signif[i] = "***"
} else if (genus_test_p$p.adj[i] < 0.01) {
genus_test_p$p.signif[i] = "**"
} else if (genus_test_p$p.adj[i] < 0.05) {
genus_test_p$p.signif[i] = "*"
} else if (genus_test_p$p.adj[i] > 0.05) {
genus_test_p$p.signif[i] = "ns"
}
}
genus_test = as.data.frame(compare_means(ratio ~ Genus,
data = sub_df_aro_tax_filt2, method = "wilcox.test",
paired = F, p.adjust.method = "fdr")) %>% filter(p.adj <
0.05) %>% select(2:3, 5, 7)
genus_test$Ratio = "IS"
for (i in 1:nrow(genus_test)) {
if (genus_test$p.adj[i] < 1e-04) {
genus_test$p.signif[i] = "****"
} else if (genus_test$p.adj[i] < 0.001) {
genus_test$p.signif[i] = "***"
} else if (genus_test$p.adj[i] < 0.01) {
genus_test$p.signif[i] = "**"
} else if (genus_test$p.adj[i] < 0.05) {
genus_test$p.signif[i] = "*"
} else if (genus_test$p.adj[i] > 0.05) {
genus_test$p.signif[i] = "ns"
}
}
# Print a table of the pairwise comparisons for
# Enterobacteriaceae genera for IS and replicon
# ratios
genus_test_both = full_join(genus_test, genus_test_p,
by = c("group1", "group2")) %>% select(1:4, 6:7)
colnames(genus_test_both) = c("Genus 1", "Genus 2",
"IS p", "IS sign.", "Replicon p", "Replicon sign.")
print("Mann-Whitney pairwise tests of IS and replicon ratios for Enterobacteriaceae genera")
## [1] "Mann-Whitney pairwise tests of IS and replicon ratios for Enterobacteriaceae genera"
# Print table with kable package
# kable(genus_test_both,longtable = T) %>%
# kable_styling(bootstrap_options = c('striped',
# 'hover', 'condensed'), latex_options =
# c('hold_position', 'repeat_header')) Print table
# with DT package
datatable(genus_test_both)
write.csv2(genus_test_both, "genus_test_both.csv")
p_genera1 = ggplot(sub_df_aro_tax, aes(x = reorder(Genus,
-ratio, FUN = mean), ratio)) + geom_jitter(aes(size = occurrences,
color = replicon_type), alpha = 0.6) + geom_boxplot(alpha = 0.2,
fatten = NULL) + stat_summary(fun.y = mean, geom = "errorbar",
aes(ymax = ..y.., ymin = ..y..), width = 0.75,
size = 1, linetype = "solid") + scale_size_continuous(range = c(1,
3)) + theme_light() + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + labs(y = "IS ratio", color = "Replicon ratio",
size = "# CRLs", x = "Genus") + ggtitle(expression("C: Top 15 " ~
italic(Enterobacteriaceae) ~ " genera")) + scale_color_continuous(high = "red",
low = "blue", guide = F)
# p_genera1
p_genera2 = ggplot(sub_df_aro_tax_filt2, aes(x = reorder(Genus,
-ratio, FUN = mean), ratio)) + geom_jitter(aes(size = occurrences,
color = replicon_type), alpha = 0.6) + scale_size_continuous(guide = F) +
scale_color_continuous(high = "red", low = "blue") +
labs(color = "Replicon ratio") + theme(legend.position = "bottom")
legend <- cowplot::get_legend(p_genera2)
grid.arrange(p_phylum, p_proteo, p_genera1, legend,
layout_matrix = rbind(c(1, 2), c(3), c(4)), heights = c(6,
6, 1))

Collect different MOB parameters and derive the ARG-MOB scale from these
#What we need and from where
#IS_ratio Main analyses
#Replicon_ratio Main analyses
#Int_ratio Integron finder. Get the total number of occurrences from dist_data and then number of integron occurrences from df_int
#Phylo spread Main analyses/taxonomy data
#A new data frame is required with all this data. Rows are individual CRLs and values
#Import IS distance and replicon data again. Same data as above, but reimported to ensure that formatting is for analyses
dist_data=read.csv("ARG_IS_distances_mech.txt",sep = "\t",header=F)
colnames(dist_data) = c("Region","ARO","Updist","Downdist","IS_Up","IS_Down","Replicon",
"Sim_cutoff","Mean_dist","Cluster_size","Mechanism","Submechanism")
#dist_data_pass_sim = dist_data[which(dist_data$Mean_dist < dist_data$Sim_cutoff),]
#dist_data = dist_data_pass_sim
dist_data2 = dist_data
dist_data2$ARO = gsub("ARO:","",dist_data2$ARO)
all_aros = unique(as.vector(dist_data2$ARO))
df <- data.frame(ARO=character(),
Occurrences=numeric(),
Ratio=numeric(),
Mean_distance=numeric(),
SD_distance=numeric(),
Mechanism=character(),
Submechanism=character(),
Zero_IS=numeric(),
Replicon=character())
for (ARO in all_aros){
temp <- dist_data2[which(dist_data2$ARO==ARO), ]
Zero_IS = nrow(temp[which(temp$Mean_dist == 0),])
number_nonzero = nrow(temp[which(temp$Mean_dist > 0),])
Ratio = number_nonzero/nrow(temp)
Occurrences = nrow(temp)
temp_nonzero = temp[which(temp$Mean_dist > 0),]
Mean_distance = mean(temp_nonzero$Mean_dist)
SD_distance = sd(temp_nonzero$Mean_dist)
Mechanism = unique(as.vector(temp$Mechanism))
IS_Up = unique(as.vector(temp$IS_Up))
IS_Down = unique(as.vector(temp$IS_Down))
Submechanism = unique(as.vector(temp$Submechanism))
p_rep = nrow(temp[which(temp$Replicon == "Plasmid"),])
c_rep = nrow(temp[which(temp$Mean_dist == "Chr"),])
Replicon = p_rep/nrow(temp)
temp2 = as.data.frame(cbind(ARO,Occurrences,Ratio,Mean_distance,SD_distance,Mechanism,Submechanism,Zero_IS,Replicon))
df = rbind(df,temp2)
}
df$Occurrences = as.numeric(as.character(df$Occurrences))
df$Mean_distance = as.numeric(as.character(df$Mean_distance))
df$SD_distance = as.numeric(as.character(df$SD_distance))
df$Ratio = as.numeric(as.character(df$Ratio))
df$Zero_IS = as.numeric(as.character(df$Zero_IS))
df$Replicon = as.numeric(as.character(df$Replicon))
#Import data on integrons
arg_int = read.csv("ARG_integrons_ISProx.txt", header = F, sep = "\t")
colnames(arg_int) = c("Region","ARO","Updist","Downdist","IS_up","IS_down",
"Replicon","Sim_cutoff","Mean_dist","Cluster_size","Mechanism","Submechanism")
arg_int$ARO = gsub("ARO:","",arg_int$ARO)
arg_int$Cluster_size = gsub("size_","",arg_int$Cluster_size)
int_all_aros = unique(as.vector(arg_int$ARO))
df_int <- data.frame(ARO=character(),
Occurrences=numeric(),
Ratio=numeric(),
Mean_distance=numeric(),
SD_distance=numeric(),
Mechanism=character(),
Submechanism=character(),
Zero_IS=numeric(),
Replicon=character(),
Total_occurrences=numeric())
#Summarise integron data per ARO
for (ARO in int_all_aros){
temp <- arg_int[which(arg_int$ARO==ARO), ]
Zero_IS = nrow(temp[which(temp$Mean_dist == 0),])
number_nonzero = nrow(temp[which(temp$Mean_dist > 0),])
Ratio = number_nonzero/nrow(temp)
Occurrences = nrow(temp)
temp_nonzero = temp[which(temp$Mean_dist > 0),]
Mean_distance = mean(temp_nonzero$Mean_dist)
SD_distance = sd(temp_nonzero$Mean_dist)
Mechanism = unique(as.vector(temp$Mechanism))
Submechanism = unique(as.vector(temp$Submechanism))
p_rep = nrow(temp[which(temp$Replicon == "Plasmid"),])
c_rep = nrow(temp[which(temp$Mean_dist == "Chr"),])
Replicon = p_rep/nrow(temp)
Total_occurrences = sum(as.numeric(as.character(temp$Cluster_size)))
temp2 = as.data.frame(cbind(ARO,Occurrences,Ratio,Mean_distance,SD_distance,Mechanism,Submechanism,Zero_IS,Replicon,Total_occurrences))
df_int = rbind(df_int,temp2)
}
df_int$Occurrences = as.numeric(as.character(df_int$Occurrences))
df_int$Mean_distance = as.numeric(as.character(df_int$Mean_distance))
df_int$SD_distance = as.numeric(as.character(df_int$SD_distance))
df_int$Ratio = as.numeric(as.character(df_int$Ratio))
df_int$Zero_IS = as.numeric(as.character(df_int$Zero_IS))
df_int$Replicon = as.numeric(as.character(df_int$Replicon))
df_int$Mechanism = gsub("antibiotic_efflux","Antibiotic efflux",df_int$Mechanism)
df_int$Mechanism = gsub("antibiotic_target_replacement","Antibiotic target replacement",df_int$Mechanism)
df_int$Mechanism = gsub("antibiotic_target_alteration","Antibiotic target alteration",df_int$Mechanism)
df_int$Mechanism = gsub("antibiotic_inactivation","Antibiotic inactivation",df_int$Mechanism)
df_int$Mechanism = gsub("reduced_permeability_to_antibiotic","Reduced permeability to antibiotic",df_int$Mechanism)
df_int$Mechanism = gsub("antibiotic_target_protection","Antibiotic target protection",df_int$Mechanism)
df_int$Submechanism = gsub("_"," ",df_int$Submechanism)
#Plot integron data
df_int2 = df_int %>%
select(Mechanism,Submechanism,Occurrences,Total_occurrences) %>%
melt() %>%
group_by(Mechanism,Submechanism,variable) %>%
summarise(Accum = sum(value))
#Make nicer names
df_int2$variable = gsub("Occurrences","CRL occurrences",df_int2$variable)
df_int2$variable = gsub("Total_occurrences","Total occurrences",df_int2$variable)
df_int2$Submechanism = gsub("beta-lactamase","b-l",df_int2$Submechanism)
df_int2$Submechanism = gsub("beta-lactams","b-lactams",df_int2$Submechanism)
df_int2$Submechanism = gsub("antibiotic","Ab.",df_int2$Submechanism)
df_int2$Submechanism = gsub("Antibiotic","Ab.",df_int2$Submechanism)
df_int2$Submechanism = gsub("chloramphenicol acetyltransferase","",df_int2$Submechanism)
df_int2$Submechanism = gsub("rifampin ADP-ribosyltransferase","",df_int2$Submechanism)
df_int2$Submechanism = gsub("transferase","trans.",df_int2$Submechanism)
df_int2$Submechanism = gsub("trimethoprim","trim.",df_int2$Submechanism)
df_int2$Submechanism = gsub("reductase","red.",df_int2$Submechanism)
df_int2$Submechanism = gsub("major facilitator superfamily \\(","",df_int2$Submechanism)
df_int2$Submechanism = gsub("small multidrug resistance \\(","",df_int2$Submechanism)
df_int2$Submechanism = gsub("resistance-nodulation-cell division \\(","",df_int2$Submechanism)
df_int2$Submechanism = gsub("ATP-binding cassette \\(","",df_int2$Submechanism)
df_int2$Submechanism = gsub("\\) ab. efflux pump","",df_int2$Submechanism)
df_int2$Submechanism = gsub("resistant","res.",df_int2$Submechanism)
df_int2$Submechanism = gsub("cassette ribosomal protection","cas. ribo. prot.",df_int2$Submechanism)
df_int2$Submechanism = gsub("protein","",df_int2$Submechanism)
df_int2$Submechanism = gsub("rifampicin","rif.",df_int2$Submechanism)
df_int2$Submechanism = gsub("permeability","perm.",df_int2$Submechanism)
df_int2$Submechanism = gsub("General Bacterial ","",df_int2$Submechanism)
df_int2$Mechanism = gsub("Antibiotic","Ab.",df_int2$Mechanism)
df_int2$Mechanism = gsub("antibiotic","Ab.",df_int2$Mechanism)
df_int2$Mechanism = gsub("permeability","perm.",df_int2$Mechanism)
df_int3 = df_int2 %>%
group_by(Mechanism, variable) %>%
select(Mechanism,variable,Accum) %>%
summarise(
Accum2 = sum(Accum)
)
df_int3 = df_int3 %>%
filter(variable == "CRL occurrences")
df_int3$Mechanism = gsub("_"," ",df_int3$Mechanism)
p_int = ggplot(df_int3, aes(x=reorder(Mechanism,desc(Accum2), FUN = sum), y = Accum2)) +
geom_bar(stat = "identity", position = "dodge") +
theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust =1,size=10)) +
labs(x = "", fill = "", y = "Count") +
ggtitle("A: Major mechanisms in integrons")
df_int2 = df_int2 %>%
filter(variable == "CRL occurrences") %>%
filter(Accum > 10)
colours = c("Ab. efflux" = "#A6CEE3", "Ab. inactivation" = "#1F78B4", "Ab. target replacement" = "#FB9A99", "Ab. target protection" = "#33A02C")
p_int_sub = ggplot(df_int2, aes(x=reorder(Submechanism,desc(Accum), FUN = sum),Accum, color = reorder(Mechanism,desc(Accum), FUN = sum))) +
geom_point(size = 3) +
theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust =1, size = 10)) +
labs(x = "", fill = "", y = "Count", color = "Mechanism") +
ggtitle("B: Submechanisms in integrons (count > 10)") +
scale_color_manual(values = colours,
labels = c("Ab. inactivation (AI)","Ab. target replacement (ATR)",
"Ab. efflux (AE)","Ab. target protection (ATP)"))
grid.arrange(p_int,p_int_sub, widths = c(1,2),ncol =2)

#Get a table of relative abundance of the top submechanisms in integrons
#df_int2 = df_int %>%
# select(Mechanism,Submechanism,Occurrences,Total_occurrences) %>%
# melt() %>%
# group_by(Mechanism,Submechanism,variable) %>%
# summarise(
# Accum = sum(value))
#df_int4 = df_int2 %>%
# filter(variable == "Occurrences")
#df_int4$SubFreq = df_int4$Accum/sum(df_int4$Accum) *100
#df_int4_top = df_int4 %>%
# group_by(Submechanism) %>%
# arrange(desc(SubFreq)) %>%
# select(Mechanism,Submechanism,Accum,SubFreq) %>%
# `colnames<-`(c("Mechanism", "Submechanism", "CRL occurrences","Percentage of occurrences"))
#Get the submech counts from main analyses for comparison
df_int_join = df_int %>% left_join(df, by = "ARO") %>%
select(ARO,Mechanism.x,Submechanism.x,Occurrences.x,Occurrences.y) %>%
arrange(desc(Occurrences.x)) %>%
`colnames<-`(c("ARO","Mechanism", "Submechanism", "Integron occurrences","Total occurrences"))
Gene = c("aadA","aadA2","AAC(6')-Ib-cr","dfrA12","OXA-1",
"dfrA14","aadA5","AAC(6')-Ib10","arr-3","sul1","dfrA1","qacH",
"catB3","cmlA1","AAC(6')-Ib9","ANT(3'')-IIa","OXA-9",
"ANT(2'')-Ia","AAC(6')-Ib7","cmlA5")
df_int_join = cbind(Gene,head(df_int_join,20))
df_int_join$'Percentage of integron occurrences' = df_int_join$`Integron occurrences`/sum(df_int_join$`Integron occurrences`) *100
df_int_join$'Percentage of total occurrences' = df_int_join$`Integron occurrences`/df_int_join$`Total occurrences` *100
datatable(df_int_join,)
#Table with kable
#kable(head(df_int_join,20)) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>% column_spec(1,italic = TRUE)
#kable(head(df_int_join,20)) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>% column_spec(1,italic = TRUE) %>%
# as_image(file = "integron_top20_submechs.png")
#Export csv file of integron results (top20)
#write.csv2(df_int_join,file = "integron_top20_submechs.csv",sep = "\t",dec = ".")
#Merge ratios with MOB parameters for creating the ARG-MOB scale
all_aros = unique(as.vector(df$ARO))
dist_int_join = full_join(df, df_int, by = "ARO") %>%
select(1,2,3,6,7,9,10,11,17)
colnames(dist_int_join) = c("ARO","Total_occurrences","IS_ratio",
"Mechanism","Submechanism","Replicon_ratio",
"Int_occurrences","Int_IS_ratio","Int_replicon_ratio")
#Merge integron data with IS distance and replicon data
dist_int_join$Int_ratio = dist_int_join$Int_occurrences/dist_int_join$Total_occurrences
#Import taxonomy data
aro_tax = read.csv("aro_tax.Rdat",sep="\t",header =F)
colnames(aro_tax) = c("Region","ARO","Up_dist","Down_dist","IS_up","IS_down",
"Replicon","Sim_cutoff","Mean_dist","Cluster_size","Mechanism","Submechanism","Acc.","Superkingdom",
"Kingdom","Phylum","Class","Order","Family","Genus")
aro_tax$ARO = gsub("ARO:","",aro_tax$ARO)
#If superkingdom is empty, remove the line
#aro_tax <- aro_tax[-which(aro_tax$Superkingdom == ""), ]
#Other tax parameters
df_aro_tax = as.data.frame(aro_tax %>%
group_by(ARO) %>%
summarise(
unique_genera = n_distinct(Family),
n = n()))
df_aro_tax$Phylo_spread = 1-(1/df_aro_tax$unique_genera)
tax_all_aros = unique(as.vector(aro_tax$ARO))
df_tax <- data.frame(ARO=character(),
Diversity=numeric())
#If the below loop fails, it helps to restart R for some reason.
#Calculate Simpson diversity index per ARO
for (ARO in tax_all_aros){
temp <- aro_tax[which(aro_tax$ARO==ARO), ]
tax_count = as.data.frame(temp %>% group_by(Genus) %>% summarize(count=n()))
rownames(tax_count) = tax_count$Genus
unique_genera = n_distinct(tax_count$Genus)
tax_count = subset(tax_count, select = count)
taxcount = nrow(temp)
Simpson_index = as.numeric(diversity(tax_count, index = "simpson"))
Simpson2 = 1-(1/as.numeric(as.character(Simpson_index)))
temp2 = as.data.frame(cbind(ARO,unique_genera,Simpson_index,taxcount,Simpson2))
df_tax = rbind(df_tax,temp2)
}
df_tax$unique_genera = as.numeric(as.character(df_tax$unique_genera))
df_tax$Simpson_index = as.numeric(as.character(df_tax$Simpson_index))
df_tax$taxcount = as.numeric(as.character(df_tax$taxcount))
df_tax$Simpson2 = as.numeric(as.character(df_tax$Simpson2))
df_tax = unique(df_tax)
#Merge with other data
dist_int_tax_join = full_join(dist_int_join, df_tax, by = "ARO") %>%
select(1,2,3,4,5,6,7,8,9,10,11,12,14)
dist_int_tax_join[is.na(dist_int_tax_join)] <- 0
dist_int_tax_join$ARGMOB = (dist_int_tax_join$IS_ratio +
dist_int_tax_join$Replicon_ratio +
dist_int_tax_join$Int_ratio +
dist_int_tax_join$Simpson_index)/4
#Rename mechanisms
dist_int_tax_join$Mechanism = gsub("antibiotic_efflux","Antibiotic efflux",dist_int_tax_join$Mechanism)
dist_int_tax_join$Mechanism = gsub("antibiotic_target_replacement","Antibiotic target replacement",dist_int_tax_join$Mechanism)
dist_int_tax_join$Mechanism = gsub("antibiotic_target_alteration","Antibiotic target alteration",dist_int_tax_join$Mechanism)
dist_int_tax_join$Mechanism = gsub("antibiotic_inactivation","Antibiotic inactivation",dist_int_tax_join$Mechanism)
dist_int_tax_join$Mechanism = gsub("reduced_permeability_to_antibiotic","Reduced permeability to antibiotic",dist_int_tax_join$Mechanism)
dist_int_tax_join$Mechanism = gsub("antibiotic_target_protection","Antibiotic target protection",dist_int_tax_join$Mechanism)
dist_int_tax_join_filt = dist_int_tax_join[which(dist_int_tax_join$Total_occurrences>10),]
dist_int_tax_join_filt = dist_int_tax_join %>%
filter(Total_occurrences> 0) %>%
filter(Mechanism != "Antibiotic efflux;Reduced permeability to antibiotic") %>%
filter(Mechanism != "Antibiotic target alteration;Antibiotic efflux") %>%
filter(Mechanism != "Reduced permeability to antibiotic") %>%
filter(Mechanism != "Antibiotic target alteration;Antibiotic target replacement")
#Test for normality per mechanism and equal variance
qqp = ggqqplot(dist_int_tax_join_filt, x = "ARGMOB",color = "Mechanism")
ggpar(qqp, legend = "right")

dist_int_tax_join_filt %>%
select(Mechanism, ARGMOB) %>% # Remove unnecessary columns
gather(key = "Mechanism", value = "ARGMOB") %>%
group_by(Mechanism) %>%
filter(n() > 3L) %>%
do(broom::tidy(shapiro.test(.$ARGMOB))) %>%
ungroup() %>%
select(Mechanism, W = statistic, 'p-value' = p.value)
## # A tibble: 5 x 3
## Mechanism W `p-value`
## <chr> <dbl> <dbl>
## 1 Antibiotic efflux 0.788 9.54e-17
## 2 Antibiotic inactivation 0.831 7.36e-27
## 3 Antibiotic target alteration 0.862 1.50e- 9
## 4 Antibiotic target protection 0.913 5.22e- 4
## 5 Antibiotic target replacement 0.950 8.38e- 2
#Fails. Data is not normally distributed
#Test for homogeneity of variance #Fails. We cannot do T-test but instead Mann-Whitney test.
bartlett.test(ARGMOB ~ Mechanism, data = dist_int_tax_join_filt)
##
## Bartlett test of homogeneity of variances
##
## data: ARGMOB by Mechanism
## Bartlett's K-squared = 63.416, df = 4, p-value = 5.547e-13
#Make a function to test the different ratios for plotting
mw_custom_fun = function() {
mech_means <- dist_int_tax_join_filt %>%
group_by(Mechanism) %>%
summarise(mean_ARGMOB = mean(ARGMOB))
colnames(mech_means) = c("group1","mech_mean_ARGMOB")
comp_means2 <- comp_means %>%
full_join(mech_means, by = "group1") %>%
drop_na(p) %>%
arrange(desc(mech_mean_ARGMOB))
colnames(mech_means) = c("group2","mech_mean_ARGMOB")
comp_means2 <- comp_means2 %>%
full_join(mech_means, by = "group2") %>%
drop_na(p) %>%
arrange(desc(mech_mean_ARGMOB.x),desc(mech_mean_ARGMOB.y))
y_pos = seq(1.1, 1+nrow(comp_means2)*0.16, 0.16)
comp_means2 = cbind(comp_means2,y_pos)
comp_means2 = comp_means2 %>%
mutate(p.signif = case_when(
p.adj < 0.0001 ~ "****",
p.adj < 0.001 ~ "***",
p.adj < 0.01 ~ "**",
p.adj < 0.05 ~ "*",
p.adj > 0.05 ~ "ns"
))
}
comp_means = compare_means(ARGMOB ~ Mechanism, data = dist_int_tax_join_filt,
method = "wilcox.test",paired = F,
p.adjust.method = "fdr") %>%
filter(p.adj < 0.05)
comp_means = mw_custom_fun()
datatable(comp_means)
#kable(comp_means) %>%
# kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
comp_means = compare_means(IS_ratio ~ Mechanism, data = dist_int_tax_join_filt,
method = "wilcox.test",paired = F,
p.adjust.method = "fdr") %>%
filter(p.adj < 0.05)
comp_means = mw_custom_fun()
p1 = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), ARGMOB)) +
geom_boxplot(outlier.shape =NA) +
theme_light() +
theme(axis.text.x=element_blank(),
legend.position = "none",
axis.title.x=element_blank()) +
geom_point(aes(color = IS_ratio, size = Total_occurrences),
position = position_jitterdodge(seed = 1,jitter.width = 0.7)) +
scale_color_continuous(high = "red",low = "blue") +
labs(title = "IS ratio", y = "ARG-MOB") +
ggtitle("IS ratio") +
scale_size_continuous(range = c(0.5,3)) +
stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
width = 0.75, size = 1, linetype = "dashed") +
scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00),
breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,1)) +
annotate("text", x = 1:5, y = 1, label = c("bold(a)","bold(b)","bold(bc)","bold(c)","bold(c)"),
parse = T, size = 3)
#p1
p1_dens = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), IS_ratio)) +
geom_boxplot(outlier.alpha = 0.5,) +
theme_light() +
ggsignif::geom_signif(data=comp_means, aes(xmin=group1, xmax=group2,
annotations=p.signif, y_position=y_pos), manual=TRUE, tip_length = 0,vjust=0.5) +
scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00),
breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,max(comp_means$y_pos+0.2))) +
theme(axis.text.x=element_blank(),
legend.position = "none",
axis.title.x=element_blank()) +
labs(y = "IS ratio")
#Replicon ratio
comp_means = compare_means(Replicon_ratio ~ Mechanism, data = dist_int_tax_join_filt,
method = "wilcox.test",paired = F,
p.adjust.method = "fdr") %>%
filter(p.adj < 0.05)
comp_means = mw_custom_fun()
p2 = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), ARGMOB)) +
geom_boxplot(outlier.shape =NA) +
theme_light() +
theme(axis.text.x=element_blank(),
legend.position = "none",
axis.title.x=element_blank()) +
geom_point(aes(color = Replicon_ratio, size = Total_occurrences),
position = position_jitterdodge(seed = 1,jitter.width = 0.7)) +
scale_color_continuous(high = "red",low = "blue") +
labs(title = "Replicon ratio", y = "ARG-MOB") +
scale_size_continuous(range = c(0.5,3)) +
stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
width = 0.75, size = 1, linetype = "dashed") +
scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00),
breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,1)) +
annotate("text", x = 1:5, y = 1, label = c("bold(a)","bold(b)","bold(bc)","bold(c)","bold(c)"),
parse = T, size = 3)
p2_dens = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), Replicon_ratio)) +
geom_boxplot(outlier.alpha = 0.5,) +
theme_light() +
ggsignif::geom_signif(data=comp_means, aes(xmin=group1, xmax=group2,
annotations=p.signif, y_position=y_pos), manual=TRUE, tip_length = 0,vjust=0.5) +
scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00),
breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,max(comp_means$y_pos+0.2))) +
theme(axis.text.x=element_blank(),
legend.position = "none",
axis.title.x=element_blank()) +
labs(y = "Replicon ratio")
#Integron ratio
comp_means = compare_means(Int_ratio ~ Mechanism, data = dist_int_tax_join_filt,
method = "wilcox.test",paired = F,
p.adjust.method = "fdr") %>%
filter(p.adj < 0.05)
comp_means = mw_custom_fun()
p3 = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), ARGMOB)) +
geom_boxplot(outlier.shape =NA) +
theme_light() +
theme(#axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none",
axis.text.x=element_blank(),
axis.title.x=element_blank()) +
geom_point(aes(color = Int_ratio, size = Total_occurrences),
position = position_jitterdodge(seed = 1,jitter.width = 0.7)) +
scale_color_continuous(high = "red",low = "blue") +
labs(title = "Integron ratio", y = "ARG-MOB")+
scale_size_continuous(range = c(0.5,3)) +
stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
width = 0.75, size = 1, linetype = "dashed") +
scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00),
breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,1)) +
annotate("text", x = 1:5, y = 1, label = c("bold(a)","bold(b)","bold(bc)","bold(c)","bold(c)"),
parse = T, size = 3)
p3_dens = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), Int_ratio)) +
geom_boxplot(outlier.alpha = 0.5,) +
theme_light() +
ggsignif::geom_signif(data=comp_means, aes(xmin=group1, xmax=group2,
annotations=p.signif, y_position=y_pos), manual=TRUE, tip_length = 0,vjust=0.5) +
scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00),
breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,max(comp_means$y_pos+0.2))) +
theme(axis.text.x=element_blank(),
legend.position = "none",
axis.title.x=element_blank()) +
labs(y = "Integron ratio")
#Simpson diversity index
comp_means = compare_means(Simpson_index ~ Mechanism, data = dist_int_tax_join_filt,
method = "wilcox.test",paired = F,
p.adjust.method = "fdr") %>%
filter(p.adj < 0.05)
comp_means = mw_custom_fun()
p4 = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), ARGMOB)) +
geom_boxplot(outlier.shape = NA) +
theme_light() +
theme(axis.text.x = element_text(angle = 60, hjust = 1, size=12),
legend.position = "none",
axis.title.x=element_blank()) +
geom_point(aes(color = Simpson_index, size = Total_occurrences),
position = position_jitterdodge(seed = 1,jitter.width = 0.7)) +
scale_color_continuous(high = "red",low = "blue") +
labs(title = "Simpson index", y = "ARG-MOB")+
scale_size_continuous(range = c(0.5,3)) +
stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
width = 0.75, size = 1, linetype = "dashed") +
scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00),
breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,1)) +
scale_x_discrete(labels=c("ATR","ATP","AI","ATA","AE")) +
annotate("text", x = 1:5, y = 1, label = c("bold(a)","bold(b)","bold(bc)","bold(c)","bold(c)"),
parse = T, size = 3)
p4_dens = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), Simpson_index)) +
geom_boxplot(outlier.alpha = 0.5) +
theme_light() +
ggsignif::geom_signif(data=comp_means, aes(xmin=group1, xmax=group2,
annotations=p.signif, y_position=y_pos), manual=TRUE, tip_length = 0,vjust=0.5) +
scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00),
breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,max(comp_means$y_pos+0.2))) +
theme(legend.position = "none",
axis.title.x=element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1, size=12)) +
scale_x_discrete(labels=c("ATR","ATP","AI","ATA","AE")) +
labs(y = "Simpson index")
p5 = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), ARGMOB)) +
geom_boxplot(outlier.shape =NA) +
theme_light() +
theme(axis.text.x = element_text(angle = 60, hjust = 1, size=12),
legend.position = "none",
axis.title.y=element_blank(),
axis.title.x=element_blank()) +
geom_point(aes(color = Exp_Ratio, size = Total_occurrences),
position = position_jitterdodge(seed = 1,jitter.width = 0.7)) +
scale_color_continuous(high = "red",low = "blue") +
labs(title = "High mob")+
scale_size_continuous(range = c(0.5,3)) +
scale_y_continuous(labels= c(0,00.25,0.50,0.75,1.00),
breaks= c(0,00.25,0.50,0.75,1.00), limits = c(0,1))
# Get the legend only
px = ggplot(dist_int_tax_join_filt, aes(x = reorder(Mechanism, -ARGMOB, FUN = mean,), ARGMOB)) +
geom_boxplot(outlier.shape =NA) +
theme_light() +
geom_point(aes(color = Simpson_index, size = Total_occurrences),
position = position_jitterdodge(seed = 1,jitter.width = 0.7)) +
scale_color_continuous(high = "red",low = "blue") +
labs(title = "Simpson index", color = "Ratios",size = "Number of\nCRLs")
legend <- cowplot::get_legend(px)
# Plots of figure a,b titles
p_a = ggplot() + ggtitle("a") + theme(plot.title = element_text(face = "bold"))
p_b = ggplot() + ggtitle("b") + theme(plot.title = element_text(face = "bold"))
p_a_title = cowplot::get_title(p_a)
p_b_title = cowplot::get_title(p_b)
grid.arrange(legend, p_a_title, p1, p2, p3, p4, p_b_title,
p1_dens, p2_dens, p3_dens, p4_dens, layout_matrix = cbind(c(1),
c(2, 3, 4, 5, 6), c(7, 8, 9, 10, 11)), widths = c(1,
5, 5), heights = c(0.2, 1, 1, 1, 1))

print("Replacement (ATR)\nProtection (ATP)\nInactivation (AI)\nAlteration (ATA)\nEfflux (AE)")
## [1] "Replacement (ATR)\nProtection (ATP)\nInactivation (AI)\nAlteration (ATA)\nEfflux (AE)"
# Make interactive plot ggplotly(p1)
mytext = paste("ARO = ", dist_int_tax_join_filt$ARO,
"\n", "ARG-MOB = ", dist_int_tax_join_filt$ARGMOB,
"\n", "Mechanism = ", dist_int_tax_join_filt$Mechanism,
"\n", "Submech, = ", dist_int_tax_join_filt$Submechanism,
"\n", "IS ratio = ", dist_int_tax_join_filt$IS_ratio,
"\n", "Replicon ratio = ", dist_int_tax_join_filt$Replicon_ratio,
"\n", "Integron ratio = ", dist_int_tax_join_filt$Int_ratio,
"\n", "Simpson index = ", dist_int_tax_join_filt$Simpson_index,
"\n")
pp = plotly_build(p1)
style(pp, hovertext = mytext, hoverinfo = "text")
Create Pearson correlation plots between ARG-MOB parameter
Pairwise Pearson correlation charts are made for all mechanisms combined and then for each mechanism independently.
A heatmap of ARG-MOB parameters is created using the most abundant CRLs (min 20 regions)
for_corr = cbind(dist_int_tax_join$IS_ratio, dist_int_tax_join$Replicon_ratio,
dist_int_tax_join$Int_ratio, dist_int_tax_join$Simpson_index)
colnames(for_corr) = c("IS ratio", "Replicon ratio",
"Int ratio", "Simpson index")
chart.Correlation(for_corr, histogram = TRUE, pch = 19)

# Do correlation chart per mechanism group (only
# major) antibiotic_efflux
# antibiotic_target_replacement
# antibiotic_target_alteration
# antibiotic_inactivation
# antibiotic_target_protection
corr_fun = function(input_df, mech, plotx) {
sub_corr = input_df[which(input_df$Mechanism ==
mech), ]
sub_corr = cbind(sub_corr$IS_ratio, sub_corr$Replicon_ratio,
sub_corr$Int_ratio, sub_corr$Simpson_index)
colnames(sub_corr) = c("IS ratio", "Replicon ratio",
"Int ratio", "Simpson index")
plotx <- chart.Correlation(sub_corr, histogram = TRUE,
pch = 19)
}
# Antibiotic efflux
corr_fun(dist_int_tax_join, "Antibiotic efflux", p1)

# Antibiotic target replacement
corr_fun(dist_int_tax_join, "Antibiotic target replacement",
p2)

# Antibiotic target alteration
corr_fun(dist_int_tax_join, "Antibiotic target alteration",
p3)

# Antibiotic inactivation
corr_fun(dist_int_tax_join, "Antibiotic inactivation",
p4)

# Antibiotic target protection
corr_fun(dist_int_tax_join, "Antibiotic target protection",
p5)

# Make heatmap. Filter out CRLs that occur less
# than 20 times, in order to avoid a cluttered
# heatmap
test = dist_int_tax_join %>% filter(Total_occurrences >
20) %>% filter(Mechanism != "Antibiotic efflux;Reduced permeability to antibiotic") %>%
filter(Mechanism != "Antibiotic target alteration;Antibiotic efflux") %>%
filter(Mechanism != "Reduced permeability to antibiotic") %>%
filter(Mechanism != "Antibiotic target alteration;Antibiotic target replacement")
for_heat = cbind(test$IS_ratio, test$Replicon_ratio,
test$Int_ratio, test$Simpson_index)
colnames(for_heat) = c("IS ratio", "Replicon ratio",
"Integron ratio", "Simpson index")
for_heatmap = t(as.data.frame(for_heat))
mechanisms_ARGMOB = data.frame(Mechanism = test$Mechanism,
`ARG-MOB` = as.numeric(test$ARGMOB))
colnames(mechanisms_ARGMOB) = c("Mechanism", "ARG-MOB")
colours = c(`All mechanisms` = "black", `Antibiotic efflux` = "#A6CEE3",
`Antibiotic inactivation` = "#1F78B4", `Antibiotic target alteration` = "#B2DF8A",
`Antibiotic target protection` = "#33A02C", `Antibiotic target replacement` = "#FB9A99")
col = list(Mechanism = c(`Antibiotic efflux` = "#A6CEE3",
`Antibiotic target replacement` = "#FB9A99", `Antibiotic target alteration` = "#B2DF8A",
`Antibiotic inactivation` = "#1F78B4", `Antibiotic target protection` = "#33A02C"),
`ARG-MOB` = circlize::colorRamp2(c(0, 1), c("white",
"red")))
ha = HeatmapAnnotation(df = mechanisms_ARGMOB, col = col)
pheat = Heatmap(for_heatmap, top_annotation = ha, cluster_rows = F,
name = "Ratio", heatmap_height = 0.1, column_dend_height = unit(30,
"mm"), heatmap_legend_param = list(ncol = 1,
legend_direction = "vertical"), col = circlize::colorRamp2(c(0,
1), c("blue", "red")))
pheat

Investigate how ARG-MOB parameters overlap using barplots.
A table of the top 10 ARG-MOB AROs per major mechanism is also produced. This table is not included in main text or supplemental information.
# Investigate how many CRLs are on plasmids that
# are also in integrons and associated with IS
# elements... And other combinations of these.
# Make a dummy variable in the arg_int dataset
# indicating that the ARG is in an integron
arg_int2 = arg_int
arg_int2$Integron = "Integron"
# Join with dist_data2
dist_data2_int = full_join(dist_data2, arg_int2, by = "Region") %>%
select(1, 2, 7, 9, 11, 24)
## Warning: Column `Region` joining factors with different levels, coercing to
## character vector
dist_data2_int[is.na(dist_data2_int)] <- "No integron"
dist_data2_int$Mean_dist.x = gsub("\\b0\\b", "No IS",
dist_data2_int$Mean_dist.x)
dist_data2_int$Mean_dist.x = gsub("[0-9]{1,}", "IS",
dist_data2_int$Mean_dist.x)
dist_data2_int$Mean_dist.x = gsub("\\.IS", "", dist_data2_int$Mean_dist.x)
dist_data2_int2 = dist_data2_int %>% group_by(Replicon.x,
Mean_dist.x, Integron, Mechanism.x) %>% summarise(no = n())
dist_data2_int2$Integron <- factor(dist_data2_int2$Integron,
levels = c("No integron", "Integron"))
# Subset to only major mechanisms
dist_data2_int3 = dist_data2_int2[which(dist_data2_int2$Mechanism.x ==
"antibiotic_efflux" | dist_data2_int2$Mechanism.x ==
"antibiotic_inactivation" | dist_data2_int2$Mechanism.x ==
"antibiotic_target_alteration" | dist_data2_int2$Mechanism.x ==
"antibiotic_target_protection" | dist_data2_int2$Mechanism.x ==
"antibiotic_target_replacement"), ]
# Rename mechanisms
dist_data2_int3$Mechanism.x = gsub("antibiotic_efflux",
"Antibiotic efflux (AE)", dist_data2_int3$Mechanism.x)
dist_data2_int3$Mechanism.x = gsub("antibiotic_target_replacement",
"Antibiotic target replacement (ATR)", dist_data2_int3$Mechanism.x)
dist_data2_int3$Mechanism.x = gsub("antibiotic_target_alteration",
"Antibiotic target alteration (ATA)", dist_data2_int3$Mechanism.x)
dist_data2_int3$Mechanism.x = gsub("antibiotic_inactivation",
"Antibiotic inactivation (AI)", dist_data2_int3$Mechanism.x)
dist_data2_int3$Mechanism.x = gsub("reduced_permeability_to_antibiotic",
"Reduced permeability to antibiotic (RPA)", dist_data2_int3$Mechanism.x)
dist_data2_int3$Mechanism.x = gsub("antibiotic_target_protection",
"Antibiotic target protection (ATP)", dist_data2_int3$Mechanism.x)
dist_data2_int3$Mean_dist.x <- factor(dist_data2_int3$Mean_dist.x,
levels = c("No IS", "IS"))
dist_data2_int3$Integron <- factor(dist_data2_int3$Integron,
levels = c("No integron", "Integron"))
ggplot(dist_data2_int3, aes(Replicon.x, no, fill = Mechanism.x)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(Mean_dist.x ~ Integron, scales = "free_y",
drop = F) + theme_light() + scale_fill_brewer(palette = "Set1") +
labs(x = "Replicon", y = "# CRLs", fill = "Mechanism")

dist_data2_int3$Mean_dist.x <- factor(dist_data2_int3$Mean_dist.x,
levels = c("IS", "No IS"))
ggplot(dist_data2_int3, aes(Replicon.x, no, fill = Mean_dist.x)) +
geom_bar(stat = "identity", position = "stack") +
facet_wrap(~Integron, drop = F) + theme_light() +
scale_fill_brewer(palette = "Set1") + labs(x = "Replicon",
y = "# CRLs", fill = "Mobilized by IS") + theme(legend.position = c(0.75,
0.8))

# Make a table of the top 10 ARG-MOB AROs per major
# mechanism. Not included in main manuscript or
# supplemental
ai_sub = unique(dist_int_tax_join_filt) %>% filter(Mechanism ==
"Antibiotic inactivation") %>% arrange(desc(ARGMOB)) %>%
head(n = 10) %>% select(1, 2, 4, 5, 3, 6, 10, 12,
14)
ae_sub = unique(dist_int_tax_join_filt) %>% filter(Mechanism ==
"Antibiotic efflux") %>% arrange(desc(ARGMOB)) %>%
head(n = 10) %>% select(1, 2, 4, 5, 3, 6, 10, 12,
14)
aa_sub = unique(dist_int_tax_join_filt) %>% filter(Mechanism ==
"Antibiotic target alteration") %>% arrange(desc(ARGMOB)) %>%
head(n = 10) %>% select(1, 2, 4, 5, 3, 6, 10, 12,
14)
ap_sub = unique(dist_int_tax_join_filt) %>% filter(Mechanism ==
"Antibiotic target protection") %>% arrange(desc(ARGMOB)) %>%
head(n = 10) %>% select(1, 2, 4, 5, 3, 6, 10, 12,
14)
ar_sub = unique(dist_int_tax_join_filt) %>% filter(Mechanism ==
"Antibiotic target replacement") %>% arrange(desc(ARGMOB)) %>%
head(n = 10) %>% select(1, 2, 4, 5, 3, 6, 10, 12,
14)
all_mech_top10 = rbind(ai_sub, ae_sub, aa_sub, ap_sub,
ar_sub) %>% arrange(desc(ARGMOB))
# Merge with protein/nucleotide accession numbers
# from CARD
card_accs = read.csv("aro_acc.txt", sep = " ", header = F)
colnames(card_accs) = c("ARO", "Prot. acc.", "Nuc. acc.")
card_accs$ARO = as.character(card_accs$ARO)
card_accs$ARO = gsub("ARO:", "", card_accs$ARO)
colnames(all_mech_top10) = c("ARO", "CRL occurrences",
"Mechanism", "Submech.", "IS\nratio", "Replicon\nratio",
"Integron\nratio", "Simpson\nindex", "ARG-MOB")
all_mech_top10_acc = all_mech_top10 %>% left_join(card_accs,
by = "ARO")
all_mech_top10_acc$Mechanism = gsub("_", " ", all_mech_top10_acc$Mechanism)
all_mech_top10_acc$Submech. = gsub("_", " ", all_mech_top10_acc$Submech.)
colnames(all_mech_top10_acc) = c("ARO", "CRL occurrences",
"Mechanism", "Submech.", "IS\nratio", "Replicon\nratio",
"Integron\nratio", "Simpson\nindex", "ARG-MOB",
"Prot.acc", "Nuc.acc")
# Kable table kable(all_mech_top10_acc, digits =
# 2,longtable = T) %>% column_spec(2, width =
# '0.8cm') %>% column_spec(3, width = '2.5cm') %>%
# column_spec(4, width = '2.5cm') %>%
# column_spec(5:9, width = '0.6cm') %>%
# kable_styling(bootstrap_options = c('striped',
# 'hover', 'condensed'), latex_options =
# c('hold_position', 'repeat_header','scale_down'),
# font_size = 7)
datatable(all_mech_top10_acc) %>% formatRound(columns = 5:9,
digits = 3)
# Export as csv for inserting into word
# supplemental information
# colnames(all_mech_top10_acc) = c('ARO','CRL
# occurrences','Mechanism','Submech.','IS ratio',
# 'Replicon ratio','Integron ratio','Simpson
# index','ARG-MOB', 'Prot.acc','Nuc.acc')
# write.csv2(all_mech_top10_acc,'all_mech_top10_acc.csv',sep
# = '\t')
Defining ARG-MOB categories
Based on manual estimates from ARG-MOB densities, groups of low to high ARG-MOB are defined. First, the density of ARG-MOB is plotted for all categories
ggplot(dist_int_tax_join, aes(ARGMOB,stat(count))) +
geom_density() +
scale_x_continuous(breaks=seq(0,1,0.05)) +
theme_light()

#ARGMOB categories
#Very low: ARGMOB = 0
#Low: ARGMOB > 0 < 0.175
#Medium: ARGMOB > 0.175 < 0.375
#High: ARGMOB > 0.375 < 0.675
#Very high ARGMOB > 0.675
#Find local minima
dens = density(dist_int_tax_join$ARGMOB)
#Adjust y coordinates by the number of observations. This is also done in the ggplot geom_density function
dens$y = dens$y * nrow(dist_int_tax_join)
densy=dens$y
densx=dens$x
low_minimum = densx[which(densy == min(densy[densx < 0.2 & densx > 0]))]
medium_minimum = densx[which(densy == min(densy[densx < 0.5 & densx > 0.2]))]
#Cutoff between high and very high is more tricky, as there is no minimum (valley), only a change in slope. Isolate two intervals of data and fit straight lines on the data. The cutoff is where the two line intersect.
high_interval = data.frame(cbind(densx[densx < 0.65 & densx > 0.55],densy[densx < 0.65 & densx > 0.55]))
colnames(high_interval) = c("x","y")
#Check that the range is reasonibly linear
plot(high_interval)

high_line = lm(y ~ x, high_interval)
high_coef = t(high_line$coefficients)
colnames(high_coef) = c("intercept","slope")
vhigh_interval = data.frame(cbind(densx[densx < 1 & densx > 0.7],densy[densx < 1 & densx > 0.7]))
colnames(vhigh_interval) = c("x","y")
#Check that the range is reasonibly linear
plot(vhigh_interval)

vhigh_line = lm(y ~ x, vhigh_interval)
vhigh_coef = t(vhigh_line$coefficients)
colnames(vhigh_coef) = c("intercept","slope")
#Where do the lines intersect? This is the cutoff between high and very high ARG-MOB
line_intersect <- rbind(coef(high_line),coef(vhigh_line))
high_minimum = c(-solve(cbind(line_intersect[,2],-1)) %*% line_intersect[,1])
high_minimum = high_minimum[1]
#vlow = 0
#low = 0.175
#medium = 0.385
#high = 0.75
#vhigh = 0.75
vlow = 0
low = low_minimum
medium = medium_minimum
high = high_minimum
vhigh = high_minimum
#Plot thresholds in vallyes and the high-very high intersection
ggplot(dist_int_tax_join, aes(ARGMOB,stat(count))) +
geom_density() +
scale_x_continuous(breaks=seq(0,1,0.05)) +
theme_light() +
annotate("rect",xmin=0,xmax=low, ymin=0,ymax=3700, fill = "#ABDDA4", alpha = 0.2) +
annotate("text",x = low/2,y = 3800, label = "Low") +
annotate("rect",xmin=low,xmax=medium, ymin=0,ymax=3700, fill = "#FFFFBF", alpha = 0.2) +
annotate("text",x = 0.28,y = 3800, label = "Medium") +
annotate("rect",xmin=medium,xmax=high, ymin=0,ymax=3700, fill = "#FDAE61", alpha = 0.2) +
annotate("text",x = 0.53,y = 3800, label = "High") +
annotate("rect",xmin=high,xmax=1, ymin=0,ymax=3700, fill = "#D7191C", alpha = 0.2) +
annotate("text",x = 0.83,y = 3800, label = "Very high") +
geom_abline(slope = high_coef[,2], intercept = high_coef[,1],color = "red", linetype = 2) +
geom_abline(slope = vhigh_coef[,2], intercept = vhigh_coef[,1], color = "red",linetype = 2) +
geom_vline(xintercept = high_minimum) +
geom_vline(xintercept = medium_minimum) +
geom_vline(xintercept = low_minimum) +
labs(x = "ARG-MOB score", y = "Normalized count") +
ggtitle("Smoothed kernel density estimate\nand ARG-MOB groupings") +
theme(plot.margin = unit(c(5.5, 20, 5.5, 5.5), "points"))

#Make a prettier plot
#Subset to only major mechanisms
dist_int_tax_join2 = dist_int_tax_join %>% filter(Mechanism %in% c("Antibiotic efflux",
"Antibiotic inactivation",
"Antibiotic target replacement",
"Antibiotic target protection",
"Antibiotic target alteration"))
dist_int_tax_join_cat = dist_int_tax_join %>%
mutate(ARGMOB_category = ifelse(ARGMOB == vlow, "Very low",
ifelse(ARGMOB > vlow & ARGMOB < low, "Low",
ifelse(ARGMOB >= low & ARGMOB < medium,"Medium",
ifelse(ARGMOB >= medium & ARGMOB < high, "High",
ifelse(ARGMOB >= high, "Very high",NA))))))
dist_int_tax_join_cat$ARGMOB_category <- factor(dist_int_tax_join_cat$ARGMOB_category,
levels = c("Very high","High","Medium","Low","Very low"))
colours = c("All mechanisms" = "black", "Antibiotic efflux" = "#A6CEE3", "Antibiotic inactivation" = "#1F78B4", "Antibiotic target alteration" = "#B2DF8A", "Antibiotic target protection" = "#33A02C", "Antibiotic target replacement" = "#FB9A99")
p_ARGMOB = ggplot(unique(dist_int_tax_join2), aes(ARGMOB,stat(count), color = Mechanism)) +
geom_vline(xintercept = c(vlow,low,medium,high,vhigh)) +
annotate("rect",xmin=0,xmax=low, ymin=0,ymax=Inf, fill = "#ABDDA4", alpha = 0.2) +
annotate("text",x = low/2,y = 3800, label = "italic(Low)", parse =T) +
annotate("rect",xmin=low,xmax=medium, ymin=0,ymax=Inf, fill = "#FFFFBF", alpha = 0.2) +
annotate("text",x = 0.28,y = 3800, label = "italic(Medium)", parse =T) +
annotate("rect",xmin=medium,xmax=high, ymin=0,ymax=Inf, fill = "#FDAE61", alpha = 0.2) +
annotate("text",x = 0.53,y = 3800, label = "italic(High)", parse =T) +
annotate("rect",xmin=high,xmax=1, ymin=0,ymax=Inf, fill = "#D7191C", alpha = 0.2) +
annotate("text",x = 0.83,y = 3800, label = "Very high", parse =F, fontface = 'italic') +
geom_density(size = 2) +
geom_density(data = unique(dist_int_tax_join),
aes(ARGMOB,stat(count), colour = "All mechanisms"),
size = 2, linetype = 1) +
scale_x_continuous(breaks=seq(0,1,0.05)) +
theme_light() +
#scale_color_brewer(palette = "Paired") +
labs(x = "ARG-MOB",y="Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom",
plot.title = element_text(face = "bold")) +
#ggtitle('A: Density of ARG-MOB per mechanism') +
ggtitle("a") +
guides(col = guide_legend(nrow = 3)) +
scale_color_manual(values = colours)
dist_int_tax_join_cat2 = dist_int_tax_join_cat %>%
filter(Mechanism %in% c("Antibiotic efflux",
"Antibiotic inactivation",
"Antibiotic target replacement",
"Antibiotic target protection",
"Antibiotic target alteration"))
dist_int_tax_join_cat2$Mechanism <- factor(dist_int_tax_join_cat2$Mechanism,levels = c("Antibiotic inactivation",
"Antibiotic efflux",
"Antibiotic target alteration",
"Antibiotic target protection",
"Antibiotic target replacement"))
p_ARGMOB2 = ggplot(unique(dist_int_tax_join_cat2), aes(Mechanism, fill = ARGMOB_category)) +
geom_bar(stat = "count") +
theme_light() +
theme(legend.position = "bottom",
#axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold"),
legend.text = element_text(face = "italic")) +
scale_fill_brewer(palette = "Spectral") +
labs(fill = "ARG-MOB category",y = "Count") +
#ggtitle("B: Category count per mechanism")
ggtitle("b") +
scale_x_discrete(labels = function(x) str_wrap(x, width = 15))
#Difficult to format this figure properly in Rmarkdown
ggarrange(p_ARGMOB, p_ARGMOB2, widths = c(1,1.2))

ggplot(unique(dist_int_tax_join2), aes(ARGMOB,stat(count))) +
geom_density(size = 2) +
geom_vline(xintercept = c(vlow,low,medium,high,vhigh)) +
annotate("text",x = low/2,y = 0, label = "Low", angle = 0,vjust=0) +
annotate("text",x = 0.28,y = 0, label = "Medium", angle = 0,vjust=0) +
annotate("text",x = 0.53,y = 0, label = "High", angle =0,vjust=0) +
annotate("text",x = 0.83,y = 0, label = "Very high", angle = 0,vjust=0) +
scale_x_continuous(breaks=seq(0,1,0.1)) +
theme_light() +
labs(x = "ARG-MOB",y="Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom") +
guides(col = guide_legend(nrow = 3)) +
facet_wrap(~Mechanism, scales = "free_y")

Extract high and very high ARG-MOB AROs and plot them
# Plot only high and very high AROs
top_sub = unique(dist_int_tax_join_filt) %>% filter(ARGMOB >
medium) %>% group_by(Submechanism) %>% filter(n() >
1)
# How many AI are high and very high?
top_sub %>% filter(Mechanism == "Antibiotic inactivation") %>%
filter(ARGMOB > medium & ARGMOB < high) %>% ungroup() %>%
tally()
## # A tibble: 1 x 1
## n
## <int>
## 1 145
top_sub %>% filter(Mechanism == "Antibiotic inactivation") %>%
filter(ARGMOB > high) %>% ungroup() %>% tally()
## # A tibble: 1 x 1
## n
## <int>
## 1 53
# What is the percentage of ATR high and very high?
atr_num = as.numeric(unique(dist_int_tax_join) %>%
filter(Mechanism == "Antibiotic target replacement") %>%
tally())
atr_high = as.numeric(unique(dist_int_tax_join) %>%
filter(Mechanism == "Antibiotic target replacement") %>%
filter(ARGMOB > medium) %>% tally())
atr_high/atr_num * 100
## [1] 64.10256
# Get submechanisms that have at least two AROs in
# High or Very high categories.
high_subs = unique(top_sub$Submechanism)
high_subs_df = dist_int_tax_join_filt[dist_int_tax_join_filt$Submechanism %in%
high_subs, ]
top_sub = top_sub %>% mutate(ARO = reorder(Submechanism,
ARGMOB, mean))
high_subs_df = high_subs_df %>% group_by(Submechanism) %>%
mutate(ARO = reorder(Submechanism, ARGMOB, mean))
high_subs_df$Submechanism = gsub("_", " ", high_subs_df$Submechanism)
# vlow = 0 low = 0.175 medium = 0.385 high = 0.75
# vhigh = 0.75
vlow = 0
low = low_minimum
medium = medium_minimum
high = high_minimum
vhigh = high_minimum
colours = c(`Antibiotic efflux` = "#A6CEE3", `Antibiotic inactivation` = "#1F78B4",
`Antibiotic target alteration` = "#B2DF8A", `Antibiotic target protection` = "#33A02C",
`Antibiotic target replacement` = "#FB9A99")
# Shorten names for plotting
high_subs_df$Submechanism = gsub("beta-lactamase",
"b-l", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("transferase", "trans.",
high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("reductase", "red.",
high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("resistant", "res.",
high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("resistance", "res.",
high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("dfr", "", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("protein", "", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("rRNA", "", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("phosphoethanolamine",
"PEA", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("major facilitator superfamily \\(MFS\\)",
"MFS", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("\\(LNU\\)", "", high_subs_df$Submechanism)
high_subs_df$Submechanism = gsub("ribosomal protection",
"rib. prot.", high_subs_df$Submechanism)
# Submechanisms with High and Very high ARG-MOB
# AROs
p_med_high_aros = ggplot(high_subs_df, aes(x = reorder(Submechanism,
-ARGMOB, FUN = median, ), ARGMOB)) + geom_hline(yintercept = high,
linetype = "dashed") + geom_hline(yintercept = medium,
linetype = "dashed") + geom_hline(yintercept = low,
linetype = "dashed") + annotate("rect", xmin = -Inf,
xmax = Inf, ymin = high, ymax = 1, fill = "#D7191C",
alpha = 0.2) + annotate("rect", xmin = -Inf, xmax = Inf,
ymin = medium, ymax = high, fill = "#FDAE61", alpha = 0.2) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = low,
ymax = medium, fill = "#FFFFBF", alpha = 0.2) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = 0,
ymax = low, fill = "#ABDDA4", alpha = 0.2) +
geom_boxplot(outlier.alpha = 0) + theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust = 1,
size = 7.5), axis.title.x = element_blank(),
legend.position = "bottom", legend.direction = "horizontal",
legend.margin = margin(), plot.title = element_text(face = "bold")) +
geom_jitter(aes(size = Total_occurrences, color = Mechanism),
alpha = 0.8) + labs(size = "CRL occurrences",
y = "ARG-MOB") + scale_color_manual(values = colours,
labels = c("Antibiotic efflux (AE)", "Antibiotic inactivation",
"Antibiotic target alteration (ATA)", "Antibiotic target protection (ATP)",
"Antibiotic target replacement (ATR)")) + # ggtitle('Submechanisms with High and Very high
# ARG-MOB AROs') +
ggtitle("c") + # guides(colour = guide_legend(override.aes =
# list(size=6),nrow=2,byrow=TRUE)) +
guides(colour = FALSE) + scale_y_continuous(breaks = seq(0,
1, 0.1)) + scale_x_discrete(labels = function(x) str_wrap(x,
width = 22))
# Composite plot
grid.arrange(p_ARGMOB, p_ARGMOB2, p_med_high_aros,
nrow = 2, layout_matrix = rbind(c(1, 2), c(3)),
heights = c(1, 1))

# Mean of the VIM beta-lactamases
vim_aros = dist_int_tax_join_filt %>% filter(Submechanism ==
"VIM_beta-lactamase")
vim_aros$weighted_mean = vim_aros$Total_occurrences *
vim_aros$ARGMOB
vim_mean = sum(vim_aros$weighted_mean)/sum(vim_aros$Total_occurrences)
# Not used
weighted_mean_AROs = dist_int_tax_join_filt %>% filter(Total_occurrences >
1) %>% mutate(weighted_mean = Total_occurrences *
ARGMOB) %>% group_by(Submechanism) %>% summarise(`Weighted mean ARG-MOB` = sum(weighted_mean)/sum(Total_occurrences),
CRLs = sum(Total_occurrences), `Unweighted median ARG-MOB` = median(ARGMOB))
datatable(weighted_mean_AROs)
# Print an interactive table of high and very high
# ARG-MOB AROs. NOT USED IN MANUSCRIPT
top_sub = dist_int_tax_join_filt %>% filter(ARGMOB >
medium) %>% group_by(Submechanism) %>% filter(n() >
1)
top_sub_table = top_sub %>% unique() %>% select(1,
4, 5, 2, 3, 6, 10, 11, 12, 14)
colnames(top_sub_table) = c("ARO", "Mechanism", "Submechanism",
"CRLs", "IS ratio", "Replicon ratio", "Integron ratio",
"# unique genera", "Simpson index", "ARG-MOB")
datatable(top_sub_table) %>% formatRound(columns = c("IS ratio",
"Replicon ratio", "Integron ratio", "Simpson index",
"ARG-MOB"), digits = 3)
Some AROs have large variance from the mean IS and replicon ratios
AROs with large variance (e.g. oqxAB) are usually not mobilized but have been in many cases. Investigate these and find AROs with largest spread from mean.
##### Calculate which AROs have largest variance for IS
##### and Replicon ratios based on genera ####
aro_tax = read.csv("aro_tax.Rdat", sep = "\t", header = F)
colnames(aro_tax) = c("Region", "ARO", "Up_dist", "Down_dist",
"IS_up", "IS_down", "Replicon", "Sim_cutoff", "Mean_dist",
"Cluster_size", "Mechanism", "Submechanism", "Acc.",
"Superkingdom", "Kingdom", "Phylum", "Class", "Order",
"Family", "Genus")
aro_tax$Mechanism = gsub("antibiotic_efflux", "Antibiotic efflux",
aro_tax$Mechanism)
aro_tax$Mechanism = gsub("antibiotic_target_replacement",
"Antibiotic target replacement", aro_tax$Mechanism)
aro_tax$Mechanism = gsub("antibiotic_target_alteration",
"Antibiotic target alteration", aro_tax$Mechanism)
aro_tax$Mechanism = gsub("antibiotic_inactivation",
"Antibiotic inactivation", aro_tax$Mechanism)
aro_tax$Mechanism = gsub("reduced_permeability_to_antibiotic",
"Reduced permeability to antibiotic", aro_tax$Mechanism)
aro_tax$Mechanism = gsub("antibiotic_target_protection",
"Antibiotic target protection", aro_tax$Mechanism)
aro_tax$ARO = gsub("ARO:", "", aro_tax$ARO)
# If superkingdom is empty, remove the line aro_tax
# <- aro_tax[-which(aro_tax$Superkingdom == ''), ]
aros = unique(as.vector(aro_tax$ARO))
# Try to make a ratio of the IS spread and use it
# in the general classification
df_sd_tax <- data.frame(ARO = character())
df_sd_tax2 <- data.frame(ARO = character())
# If the below loop fails, it helps to restart R
# for some reason.
for (ARO in aros) {
temp <- aro_tax[which(aro_tax$ARO == ARO), ]
mech = unique(as.character(temp$Mechanism))
temp2 = temp %>% group_by(Genus) %>% summarise(mean_dist = mean(Mean_dist),
zeroes = sum(Mean_dist == 0), plasmids = sum(Replicon ==
"Plasmid"), chrs = sum(Replicon == "Chr"),
ratio = sum(Mean_dist > 0)/(sum(Mean_dist ==
0) + sum(Mean_dist > 0)), occurrences = n(),
replicon_type = sum(Replicon == "Plasmid")/(sum(Replicon ==
"Plasmid") + sum(Replicon == "Chr")))
# Filter out ARO:Genus combinations that occur only
# once (singletons). Many of these are likely not
# correctly annotated genera
temp2_2 = temp2[which(temp2$occurrences > 1), ]
sum_occur = sum(temp2_2$occurrences)
is_mean = (sum(temp2_2$occurrences) - sum(temp2_2$zeroes))/sum(temp2_2$occurrences)
temp2_2$is_var = temp2_2$ratio - is_mean
rep_mean = sum(temp2_2$plasmids)/sum(temp2_2$occurrences)
temp2_2$rep_var = temp2_2$replicon_type - rep_mean
# Filter out ARO:Genus combinations that occur only
# once (singletons). Many of these are likely not
# correctly annotated genera
temp3 = temp2[which(temp2$occurrences > 1), ] %>%
summarise(I_Variance = var(ratio), I_SD = sd(ratio),
I_Mean = mean(ratio), R_Variance = var(replicon_type),
R_SD = sd(replicon_type), R_Mean = mean(replicon_type))
temp4 = as.data.frame(cbind(ARO, mech, temp3))
if (nrow(temp2_2) > 0) {
temp5 = as.data.frame(cbind(ARO, mech, temp2_2))
df_sd_tax2 = rbind(df_sd_tax2, temp5)
} else {
df_sd_tax2 = df_sd_tax2
}
df_sd_tax = rbind(df_sd_tax, temp4)
}
df_sd_tax2_3 = df_sd_tax2 %>% group_by(mech, Genus) %>%
summarise(Positive_spread = sum(is_var[is_var >
0]), Negative_spread = sum(is_var[is_var <
0]), Positive_spread_rep = sum(rep_var[rep_var >
0]), Negative_spread_rep = sum(rep_var[rep_var <
0])) %>% arrange(desc(Positive_spread)) %>%
filter(Positive_spread + abs(Negative_spread) >
1) %>% filter(mech != "Antibiotic efflux;reduced_permeability_to_antibiotic")
p1 = ggplot(df_sd_tax2_3) + geom_bar(aes(x = reorder(Genus,
Positive_spread, FUN = sum), y = Positive_spread,
fill = mech), stat = "identity") + geom_bar(aes(x = reorder(Genus,
Positive_spread, FUN = sum), y = Negative_spread,
fill = mech), stat = "identity") + coord_flip() +
theme_light() + geom_hline(yintercept = 0) + scale_fill_brewer(palette = "Paired") +
scale_x_discrete(position = "top") + labs(y = "Summed IS mob. spread",
fill = "Mechanism") + theme(legend.position = "none",
axis.title.y = element_blank())
p2 = ggplot(df_sd_tax2_3) + geom_bar(aes(x = reorder(Genus,
Positive_spread, FUN = sum), y = Positive_spread_rep,
fill = mech), stat = "identity") + geom_bar(aes(x = reorder(Genus,
Positive_spread, FUN = sum), y = Negative_spread_rep,
fill = mech), stat = "identity") + coord_flip() +
theme_light() + geom_hline(yintercept = 0) + scale_fill_brewer(palette = "Paired") +
theme(legend.position = "none", axis.title.y = element_blank(),
axis.text.y = element_blank()) + labs(y = "Summed Rep. mob. spread")
p_legend = ggplot(df_sd_tax2_3) + geom_bar(aes(x = reorder(Genus,
Positive_spread, FUN = sum), y = Positive_spread,
fill = mech), stat = "identity") + scale_fill_brewer(palette = "Paired") +
labs(y = "Summed IS mob. spread", fill = "Mechanism") +
theme(legend.position = "bottom") + guides(fill = guide_legend(nrow = 2))
legend <- cowplot::get_legend(p_legend)
grid.arrange(legend, p1, p2, nrow = 2, layout_matrix = rbind(c(2,
3), c(1)), heights = c(10, 1), top = textGrob("Summed spread from means of ratios per ARO grouped by genus",
gp = gpar(fontsize = 14, font = 1)))

# Group by ARO only to find the most diverging ARGs
df_sd_tax2_4 = df_sd_tax2 %>% group_by(ARO, mech) %>%
summarise(Positive_spread = sum(is_var[is_var >
0]), Negative_spread = sum(is_var[is_var <
0]), Positive_spread_rep = sum(rep_var[rep_var >
0]), Negative_spread_rep = sum(rep_var[rep_var <
0]), test_rat = sum(ratio)/sum(abs(is_var))) %>%
arrange(desc(Positive_spread)) %>% filter(Positive_spread +
abs(Negative_spread) > 1) %>% filter(mech != "Antibiotic efflux;Reduced permeability to antibiotic") %>%
filter(mech != "Antibiotic target alteration;Antibiotic target replacement") %>%
filter(mech != "Antibiotic efflux;reduced_permeability_to_antibiotic")
df_sd_tax2_5 = melt(df_sd_tax2_4, id.vars = c("ARO",
"mech"), measure.vars = c("Positive_spread", "Negative_spread",
"Positive_spread_rep", "Negative_spread_rep"))
p5 = ggplot(df_sd_tax2_5) + geom_bar(aes(x = reorder(ARO,
value, FUN = sum), y = value, fill = variable),
stat = "identity") + coord_flip() + theme_light() +
geom_hline(yintercept = 0) + scale_fill_manual(values = c("red",
"red", "blue", "blue")) + scale_x_discrete(position = "top") +
labs(y = "Summed IS (blue) and Replicon (red) spread",
fill = "Parameter", x = "AROs") + theme(legend.position = "non") +
facet_wrap(~mech, scales = "free_y", nrow = 1)
p5

# Show oqxA example as table
oqxA_table = df_sd_tax2 %>% filter(ARO == "3003922") %>%
select("Genus", "occurrences", "chrs", "plasmids",
"zeroes", "ratio", "replicon_type") %>% arrange(desc(occurrences))
colnames(oqxA_table) = c("# Identified in total", "# Found on chromosome",
"# Found on plasmid", "#No IS association", "IS ratio",
"Replicon ratio")
oqxA_table
## # Identified in total # Found on chromosome # Found on plasmid
## 1 Klebsiella 458 457
## 2 Enterobacter 124 123
## 3 Escherichia 36 2
## 4 Salmonella 20 3
## 5 Raoultella 14 14
## 6 Citrobacter 8 8
## 7 Kosakonia 7 7
## 8 Lelliottia 6 6
## 9 Cedecea 5 5
## 10 Phytobacter 2 2
## 11 unclassified_Enterobacteriaceae 2 2
## #No IS association IS ratio Replicon ratio NA
## 1 1 387 0.15502183 0.002183406
## 2 1 111 0.10483871 0.008064516
## 3 34 0 1.00000000 0.944444444
## 4 17 0 1.00000000 0.850000000
## 5 0 13 0.07142857 0.000000000
## 6 0 8 0.00000000 0.000000000
## 7 0 4 0.42857143 0.000000000
## 8 0 6 0.00000000 0.000000000
## 9 0 3 0.40000000 0.000000000
## 10 0 2 0.00000000 0.000000000
## 11 0 2 0.00000000 0.000000000
A sheet with all relevant information for all evaluated 1176 AROs is exported. This table is provided as supplemental information and can be used as a lookup table for finding highly mobilized AROs (or AROs with other relevant traits)
# Add the spreads to the other data for exporting
# as excel sheet
df_sd_tax2_6 = df_sd_tax2 %>% group_by(ARO, mech) %>%
summarise(Positive_spread = sum(is_var[is_var >
0]), Negative_spread = sum(is_var[is_var <
0]), Positive_spread_rep = sum(rep_var[rep_var >
0]), Negative_spread_rep = sum(rep_var[rep_var <
0]), test_rat = sum(ratio)/sum(abs(is_var))) %>%
arrange(desc(Positive_spread))
tbl_export = df_sd_tax2_6 %>% full_join(dist_int_tax_join,
by = "ARO") %>% select("ARO", "Mechanism", "Submechanism",
"Total_occurrences", "IS_ratio", "Replicon_ratio",
"Int_ratio", "Simpson_index", "ARGMOB", "unique_genera",
"Int_occurrences", "Int_IS_ratio", "Int_replicon_ratio",
"Positive_spread", "Negative_spread", "Positive_spread_rep",
"Negative_spread_rep")
# Join with df from main analysis - requires that
# main analysis has been run
# First remake df to make sure the format is
# correct
dist_data = read.csv("ARG_IS_distances_mech.txt", sep = "\t",
header = F)
colnames(dist_data) = c("Region", "ARO", "Updist",
"Downdist", "IS_Up", "IS_Down", "Replicon", "Sim_cutoff",
"Mean_dist", "Cluster_size", "Mechanism", "Submechanism")
dist_data$Cluster_size = gsub("size_", "", dist_data$Cluster_size)
# For each region with an ARG, there can be up to
# two closest IS element: upstream and downstream.
# The distance to these are denoted as Updist and
# Downdist, while the Mean_dist column is the mean
# of the two. A distance of 0 indicates that no IS
# element was found. Cluster_size refers to how
# many individual regions were clustered to form a
# given CRL Region.
dist_data2 = dist_data
dist_data2$ARO = gsub("ARO:", "", dist_data2$ARO)
# Make a new dataframe where various metric are
# calculated and summarized
all_aros = unique(as.vector(dist_data2$ARO))
df <- data.frame(ARO = character(), Occurrences = numeric(),
Ratio = numeric(), Mean_distance = numeric(), SD_distance = numeric(),
Mechanism = character(), Submechanism = character(),
Zero_IS = numeric(), Replicon = character(), Total_seqs = numeric())
for (ARO in all_aros) {
temp <- dist_data2[which(dist_data2$ARO == ARO),
]
Zero_IS = nrow(temp[which(temp$Mean_dist == 0),
])
number_nonzero = nrow(temp[which(temp$Mean_dist >
0), ])
Ratio = number_nonzero/nrow(temp)
Occurrences = nrow(temp)
temp_nonzero = temp[which(temp$Mean_dist > 0),
]
Mean_distance = mean(temp_nonzero$Mean_dist)
SD_distance = sd(temp_nonzero$Mean_dist)
Mechanism = unique(as.vector(temp$Mechanism))
IS_Up = unique(as.vector(temp$IS_Up))
IS_Down = unique(as.vector(temp$IS_Down))
Submechanism = unique(as.vector(temp$Submechanism))
p_rep = nrow(temp[which(temp$Replicon == "Plasmid"),
])
c_rep = nrow(temp[which(temp$Mean_dist == "Chr"),
])
Replicon = p_rep/nrow(temp)
Total_seqs = sum(as.numeric(temp$Cluster_size))
temp2 = as.data.frame(cbind(ARO, Occurrences, Ratio,
Mean_distance, SD_distance, Mechanism, Submechanism,
Zero_IS, Replicon, Total_seqs))
df = rbind(df, temp2)
}
# Make sure that values in columns have a correct
# (numeric) format
df$Occurrences = as.numeric(as.character(df$Occurrences))
df$Mean_distance = as.numeric(as.character(df$Mean_distance))
df$SD_distance = as.numeric(as.character(df$SD_distance))
df$Ratio = as.numeric(as.character(df$Ratio))
df$Zero_IS = as.numeric(as.character(df$Zero_IS))
df$Replicon = as.numeric(as.character(df$Replicon))
df$Total_seqs = as.numeric(as.character(df$Total_seqs))
tbl_export2 = tbl_export %>% full_join(df, by = "ARO") %>%
select(1:4, "Total_seqs", 5:17) %>% full_join(card_accs,
by = "ARO") %>% unique()
colnames(tbl_export2) = c("ARO", "Mechanism", "Submechanism",
"CRLs", "Total unclustered regions", "IS ratio",
"Replicon ratio", "Integron ratio", "Simpson index",
"ARG-MOB", "Occurs in # genera", "# integron cases",
"Ratio of integron cases with IS elements", "Ratio of integron cases on plasmids",
"Summed positive spread from mean of IS ratio",
"Summed negative spread from mean of IS ratio",
"Summed positive spread from mean of replicon ratio",
"Summed negative spread from mean of replicon ratio",
"Example protein accession", "Example nucleotide accession")
datatable(tbl_export2, rownames = FALSE, filter = "top",
options = list(pageLength = 10, scrollX = T)) %>%
formatRound(columns = 6:18, digits = 2)
# columnDefs = list(list()))) %>%
# write.csv2(tbl_export2,file =
# 'ARG_MOB_data.csv',sep = '\t',dec = '.')
Make table and simple plot of Mechanism, CRL, and ARO counts
# Import the overview of mechanism count, generated
# in bash script
loci_count = read.csv("passing_regions_mech", header = F,
sep = " ")
colnames(loci_count) = c("Loci", "Mechanism")
crl_count = dist_data2 %>% group_by(Mechanism) %>%
tally()
colnames(crl_count) = c("Mechanism", "CRLs")
aro_count = df %>% group_by(Mechanism) %>% tally()
colnames(aro_count) = c("Mechanism", "AROs")
# Merge the three count tables
three_counts = full_join(loci_count, crl_count, by = "Mechanism") %>%
full_join(aro_count, by = "Mechanism") %>% select(2,
1, 3, 4) %>% arrange(desc(Loci))
three_counts$Mechanism = gsub("antibiotic_efflux",
"Antibiotic efflux (AE)", three_counts$Mechanism)
three_counts$Mechanism = gsub("antibiotic_target_replacement",
"Antibiotic target replacement (ATR)", three_counts$Mechanism)
three_counts$Mechanism = gsub("antibiotic_target_alteration",
"Antibiotic target alteration (ATA)", three_counts$Mechanism)
three_counts$Mechanism = gsub("antibiotic_inactivation",
"Antibiotic inactivation (AI)", three_counts$Mechanism)
three_counts$Mechanism = gsub("reduced_permeability_to_antibiotic",
"Reduced permeability to antibiotic (RPA)", three_counts$Mechanism)
three_counts$Mechanism = gsub("antibiotic_target_protection",
"Antibiotic target protection (ATP)", three_counts$Mechanism)
three_counts$Mechanism = gsub(";", ";\n", three_counts$Mechanism)
# Make a table for under the plot
tt <- ttheme_default(core = list(bg_params = list(fill = c("#A6CEE3",
"#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C",
"#FDBF6F", "#FF7F00", "#CAB2D6")), fg_params = list(hjust = 0,
x = 0.1)), rowhead = list(fg_params = list(hjust = 0,
x = 0)), padding = unit(c(10, 4), "mm"), base_size = 10,
)
tbl <- tableGrob(three_counts, rows = NULL, theme = tt)
counts_plot = ggplot(three_counts, aes(Loci, AROs,
size = CRLs, color = reorder(Mechanism, desc(Loci),
FUN = sum))) + geom_point() + theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.x = element_text(angle = 45), legend.position = "none") +
scale_color_brewer(palette = "Paired")
grid.arrange(counts_plot, tbl, nrow = 1, widths = c(3,
5))
